-
Notifications
You must be signed in to change notification settings - Fork 43
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1057 from MichaelDecent/pkg2
Add Swarmauri Keyword Extractor Plugin
- Loading branch information
Showing
8 changed files
with
155 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Swarmauri Keyword Extractor Plugin |
53 changes: 53 additions & 0 deletions
53
pkgs/standards/swarmauri_parser_keywordextractor/pyproject.toml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# Packaging metadata for the Swarmauri keyword-extractor parser plugin.
[tool.poetry]
name = "swarmauri_parser_keywordextractor"
version = "0.6.0.dev1"
description = "Swarmauri parser plugin that extracts keywords from text using YAKE."
authors = ["Jacob Stewart <[email protected]>"]
license = "Apache-2.0"
readme = "README.md"
repository = "http://github.com/swarmauri/swarmauri-sdk"
classifiers = [
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12"
]

[tool.poetry.dependencies]
python = ">=3.10,<3.13"

# Swarmauri
swarmauri_core = { path = "../../core" }
swarmauri_base = { path = "../../base" }
# NOTE(review): KeywordExtractorParser.py also imports
# swarmauri_standard.documents.Document, which is not declared here;
# confirm how that package is expected to be provided.

# Keyword extraction backend imported by KeywordExtractorParser.py.
# TODO confirm the supported version range.
yake = "^0.4.8"

[tool.poetry.group.dev.dependencies]
flake8 = "^7.0"
pytest = "^8.0"
pytest-asyncio = ">=0.24.0"
pytest-xdist = "^3.6.1"
pytest-json-report = "^1.5.0"
python-dotenv = "*"
requests = "^2.32.3"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
norecursedirs = ["combined", "scripts"]

markers = [
    "test: standard test",
    "unit: Unit tests",
    "integration: Integration tests",
    "acceptance: Acceptance tests",
    "experimental: Experimental tests"
]
log_cli = true
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
asyncio_default_fixture_loop_scope = "function"

# Entry point exposing the parser under the "swarmauri.parsers" plugin group.
[tool.poetry.plugins."swarmauri.parsers"]
KeywordExtractorParser = "swarmauri_parser_keywordextractor:KeywordExtractorParser"
55 changes: 55 additions & 0 deletions
55
...mauri_parser_keywordextractor/swarmauri_parser_keywordextractor/KeywordExtractorParser.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import yake | ||
from typing import List, Union, Any, Literal | ||
from pydantic import ConfigDict, PrivateAttr | ||
from swarmauri_standard.documents.Document import Document | ||
from swarmauri_base.parsers.ParserBase import ParserBase | ||
from swarmauri_core.ComponentBase import ComponentBase | ||
|
||
|
||
@ComponentBase.register_type(ParserBase, "KeywordExtractorParser")
class KeywordExtractorParser(ParserBase):
    """
    Extracts keywords from text using the YAKE keyword extraction library.

    Attributes:
        lang: Language code handed to YAKE as its ``lan`` argument.
        num_keywords: Number of keywords requested from YAKE (its ``top``
            argument).
    """

    lang: str = "en"
    num_keywords: int = 10
    # Built in __init__ from the validated fields above; private, so it is
    # not a declared model field.
    _kw_extractor: yake.KeywordExtractor = PrivateAttr(default=None)
    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
    type: Literal["KeywordExtractorParser"] = "KeywordExtractorParser"

    def __init__(self, **data):
        """
        Validate the model fields, then build the YAKE extractor from them.

        Parameters:
        - **data: Field values (e.g. ``lang``, ``num_keywords``) forwarded to
          the base class initializer.
        """
        super().__init__(**data)
        # The extractor is configured once here, so later changes to
        # ``lang`` / ``num_keywords`` do not reconfigure it.
        self._kw_extractor = yake.KeywordExtractor(
            lan=self.lang,
            n=3,
            dedupLim=0.9,
            dedupFunc="seqm",
            windowsSize=1,
            top=self.num_keywords,
            features=None,
        )

    def parse(self, data: Union[str, Any]) -> List[Document]:
        """
        Extract keywords from input text and return them as Document instances.

        Parameters:
        - data (Union[str, Any]): The input text from which to extract
          keywords. Non-string input is converted with ``str()`` first.

        Returns:
        - List[Document]: One Document per extracted keyword, in the order
          YAKE returned them. ``content`` holds the keyword and
          ``metadata["score"]`` holds the score YAKE assigned to it.
        """
        # Ensure data is in string format for analysis
        text = data if isinstance(data, str) else str(data)

        # YAKE yields (keyword, score) pairs
        keywords = self._kw_extractor.extract_keywords(text)

        # Create a Document instance for each keyword
        return [
            Document(content=keyword, metadata={"score": score})
            for keyword, score in keywords
        ]
14 changes: 14 additions & 0 deletions
14
...standards/swarmauri_parser_keywordextractor/swarmauri_parser_keywordextractor/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from importlib.metadata import PackageNotFoundError, version as _dist_version

from .KeywordExtractorParser import KeywordExtractorParser

__all__ = ["KeywordExtractorParser"]

# Prefer the version recorded in the installed distribution's metadata so
# this value cannot drift from the one declared in pyproject.toml.
try:
    __version__ = _dist_version("swarmauri_parser_keywordextractor")
except PackageNotFoundError:
    # Distribution metadata is unavailable (e.g. running from a source
    # checkout that was never installed): fall back to the literal.
    __version__ = "0.6.0.dev26"

__long_desc__ = """
# Swarmauri Keyword Extractor Plugin
This repository includes a Keyword Extractor Swarmauri Plugin.
Visit us at: https://swarmauri.com
Follow us at: https://github.com/swarmauri
Star us at: https://github.com/swarmauri/swarmauri-sdk
"""
28 changes: 28 additions & 0 deletions
28
...tandards/swarmauri_parser_keywordextractor/tests/unit/KeywordExtractorParser_unit_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import pytest | ||
from swarmauri_parser_keywordextractor.KeywordExtractorParser import ( | ||
KeywordExtractorParser as Parser, | ||
) | ||
|
||
|
||
@pytest.mark.unit
def test_ubc_resource():
    """A default-constructed parser reports the "Parser" resource."""
    assert Parser().resource == "Parser"
|
||
|
||
@pytest.mark.unit
def test_ubc_type():
    """A default-constructed parser reports its own class name as type."""
    assert Parser().type == "KeywordExtractorParser"
|
||
|
||
@pytest.mark.unit
def test_serialization():
    """The parser id is preserved across a JSON dump / validate round trip."""
    original = Parser()
    as_json = original.model_dump_json()
    restored = Parser.model_validate_json(as_json)
    assert original.id == restored.id
|
||
|
||
@pytest.mark.unit
def test_parse():
    """Parsing a short phrase returns Documents; the third is "burgers"."""
    # Parse once and reuse the result instead of building a parser and
    # running extraction separately for every assertion.
    documents = Parser().parse("test two burgers")

    # Guard the index so a short result fails with a readable assertion
    # rather than an IndexError.
    assert len(documents) >= 3

    third = documents[2]
    assert third.resource == "Document"
    assert third.content == "burgers"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters