diff --git a/pkgs/standards/swarmauri_parser_keywordextractor/README.md b/pkgs/standards/swarmauri_parser_keywordextractor/README.md
new file mode 100644
index 00000000..24ded9c4
--- /dev/null
+++ b/pkgs/standards/swarmauri_parser_keywordextractor/README.md
@@ -0,0 +1 @@
+# Swarmauri Keyword Extractor Plugin
\ No newline at end of file
diff --git a/pkgs/standards/swarmauri_parser_keywordextractor/pyproject.toml b/pkgs/standards/swarmauri_parser_keywordextractor/pyproject.toml
new file mode 100644
index 00000000..4b241918
--- /dev/null
+++ b/pkgs/standards/swarmauri_parser_keywordextractor/pyproject.toml
@@ -0,0 +1,53 @@
+[tool.poetry]
+name = "swarmauri_parser_keywordextractor"
+version = "0.6.0.dev1"
+description = "A First Class Swarmauri plugin providing a YAKE-based keyword extractor parser."
+authors = ["Jacob Stewart "]
+license = "Apache-2.0"
+readme = "README.md"
+repository = "http://github.com/swarmauri/swarmauri-sdk"
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12"
+]
+
+[tool.poetry.dependencies]
+python = ">=3.10,<3.13"
+
+# Swarmauri
+swarmauri_core = { path = "../../core" }
+swarmauri_base = { path = "../../base" }
+
+[tool.poetry.group.dev.dependencies]
+flake8 = "^7.0"
+pytest = "^8.0"
+pytest-asyncio = ">=0.24.0"
+pytest-xdist = "^3.6.1"
+pytest-json-report = "^1.5.0"
+python-dotenv = "*"
+requests = "^2.32.3"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+norecursedirs = ["combined", "scripts"]
+
+markers = [
+    "test: standard test",
+    "unit: Unit tests",
+    "integration: Integration tests",
+    "acceptance: Acceptance tests",
+    "experimental: Experimental tests"
+]
+log_cli = true
+log_cli_level = "INFO"
+log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
+log_cli_date_format = "%Y-%m-%d %H:%M:%S"
+asyncio_default_fixture_loop_scope = "function"
+
+[tool.poetry.plugins."swarmauri.parsers"]
+KeywordExtractorParser = "swarmauri_parser_keywordextractor:KeywordExtractorParser"
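The [tool.poetry.plugins."swarmauri.parsers"] table above registers the parser as an entry point in the "swarmauri.parsers" group. A rough sketch of how that entry point can be discovered and loaded at runtime (not part of the diff; it assumes the package has been installed so its metadata is visible):

from importlib.metadata import entry_points

# Entry points declared under [tool.poetry.plugins."swarmauri.parsers"] land in this group.
for ep in entry_points(group="swarmauri.parsers"):
    if ep.name == "KeywordExtractorParser":
        parser_cls = ep.load()  # resolves swarmauri_parser_keywordextractor:KeywordExtractorParser
        print(parser_cls)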
+ """ + + lang: str = "en" + num_keywords: int = 10 + _kw_extractor: yake.KeywordExtractor = PrivateAttr(default=None) + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) + type: Literal["KeywordExtractorParser"] = "KeywordExtractorParser" + + def __init__(self, **data): + super().__init__(**data) + self._kw_extractor = yake.KeywordExtractor( + lan=self.lang, + n=3, + dedupLim=0.9, + dedupFunc="seqm", + windowsSize=1, + top=self.num_keywords, + features=None, + ) + + def parse(self, data: Union[str, Any]) -> List[Document]: + """ + Extract keywords from input text and return as list of Document instances containing keyword information. + + Parameters: + - data (Union[str, Any]): The input text from which to extract keywords. + + Returns: + - List[Document]: A list of Document instances, each containing information about an extracted keyword. + """ + # Ensure data is in string format for analysis + text = str(data) if not isinstance(data, str) else data + + # Extract keywords using YAKE + keywords = self._kw_extractor.extract_keywords(text) + + # Create Document instances for each keyword + documents = [ + Document(content=keyword, metadata={"score": score}) + for index, (keyword, score) in enumerate(keywords) + ] + + return documents diff --git a/pkgs/standards/swarmauri_parser_keywordextractor/swarmauri_parser_keywordextractor/__init__.py b/pkgs/standards/swarmauri_parser_keywordextractor/swarmauri_parser_keywordextractor/__init__.py new file mode 100644 index 00000000..834b193b --- /dev/null +++ b/pkgs/standards/swarmauri_parser_keywordextractor/swarmauri_parser_keywordextractor/__init__.py @@ -0,0 +1,14 @@ +from .KeywordExtractorParser import KeywordExtractorParser + +__version__ = "0.6.0.dev26" +__long_desc__ = """ + +# Swarmauri Keyword Extractor Plugin + +This repository includes an Keyword Extractor of a Swarmauri Plugin. 
diff --git a/pkgs/standards/swarmauri_parser_keywordextractor/swarmauri_parser_keywordextractor/__init__.py b/pkgs/standards/swarmauri_parser_keywordextractor/swarmauri_parser_keywordextractor/__init__.py
new file mode 100644
index 00000000..834b193b
--- /dev/null
+++ b/pkgs/standards/swarmauri_parser_keywordextractor/swarmauri_parser_keywordextractor/__init__.py
@@ -0,0 +1,14 @@
+from .KeywordExtractorParser import KeywordExtractorParser
+
+__version__ = "0.6.0.dev26"
+__long_desc__ = """
+
+# Swarmauri Keyword Extractor Plugin
+
+This package provides the KeywordExtractorParser plugin for Swarmauri.
+
+Visit us at: https://swarmauri.com
+Follow us at: https://github.com/swarmauri
+Star us at: https://github.com/swarmauri/swarmauri-sdk
+
+"""
diff --git a/pkgs/standards/swarmauri_parser_keywordextractor/tests/unit/KeywordExtractorParser_unit_test.py b/pkgs/standards/swarmauri_parser_keywordextractor/tests/unit/KeywordExtractorParser_unit_test.py
new file mode 100644
index 00000000..1b4cc2dd
--- /dev/null
+++ b/pkgs/standards/swarmauri_parser_keywordextractor/tests/unit/KeywordExtractorParser_unit_test.py
@@ -0,0 +1,28 @@
+import pytest
+from swarmauri_parser_keywordextractor.KeywordExtractorParser import (
+    KeywordExtractorParser as Parser,
+)
+
+
+@pytest.mark.unit
+def test_ubc_resource():
+    parser = Parser()
+    assert parser.resource == "Parser"
+
+
+@pytest.mark.unit
+def test_ubc_type():
+    parser = Parser()
+    assert parser.type == "KeywordExtractorParser"
+
+
+@pytest.mark.unit
+def test_serialization():
+    parser = Parser()
+    assert parser.id == Parser.model_validate_json(parser.model_dump_json()).id
+
+
+@pytest.mark.unit
+def test_parse():
+    assert Parser().parse("test two burgers")[2].resource == "Document"
+    assert Parser().parse("test two burgers")[2].content == "burgers"
diff --git a/pkgs/standards/swarmauri_standard/pyproject.toml b/pkgs/standards/swarmauri_standard/pyproject.toml
index 29b8f58c..a807e996 100644
--- a/pkgs/standards/swarmauri_standard/pyproject.toml
+++ b/pkgs/standards/swarmauri_standard/pyproject.toml
@@ -80,7 +80,7 @@ full = [
     "aiofiles", "aiohttp",
     #"cohere", "mistralai", "fal-client", "google-generativeai", "openai",
     #"nltk", "textblob",
-    "yake",
+    #"yake",
     "beautifulsoup4",
     "scikit-learn",
     #"gensim", "scipy", "scikit-learn",
diff --git a/pkgs/swarmauri/pyproject.toml b/pkgs/swarmauri/pyproject.toml
index 1590c09b..9ee80e5e 100644
--- a/pkgs/swarmauri/pyproject.toml
+++ b/pkgs/swarmauri/pyproject.toml
@@ -51,6 +51,7 @@ full = [
 ]
 doc2vecvectorstore = ["swarmauri_doc2vec_vectorstore"]
 matplotlib_tool = ["swarmauri_tool_matplotlib"]
+keywordextractor_parser = ["swarmauri_parser_keywordextractor"]
 
 [tool.setuptools]
 namespace_packages = ["swarmauri"]
diff --git a/pkgs/swarmauri/swarmauri/plugin_citizenship_registry.py b/pkgs/swarmauri/swarmauri/plugin_citizenship_registry.py
index 00f19d41..a75ed24c 100644
--- a/pkgs/swarmauri/swarmauri/plugin_citizenship_registry.py
+++ b/pkgs/swarmauri/swarmauri/plugin_citizenship_registry.py
@@ -137,7 +137,7 @@ class PluginCitizenshipRegistry:
         "swarmauri.parsers.BeautifulSoupElementParser": "swarmauri_standard.parsers.BeautifulSoupElementParser",
         "swarmauri.parsers.CSVParser": "swarmauri_standard.parsers.CSVParser",
         "swarmauri.parsers.HTMLTagStripParser": "swarmauri_standard.parsers.HTMLTagStripParser",
-        "swarmauri.parsers.KeywordExtractorParser": "swarmauri_standard.parsers.KeywordExtractorParser",
+        # "swarmauri.parsers.KeywordExtractorParser": "swarmauri_standard.parsers.KeywordExtractorParser",
         "swarmauri.parsers.Md2HtmlParser": "swarmauri_standard.parsers.Md2HtmlParser",
         "swarmauri.parsers.OpenAPISpecParser": "swarmauri_standard.parsers.OpenAPISpecParser",
         "swarmauri.parsers.PhoneNumberExtractorParser": "swarmauri_standard.parsers.PhoneNumberExtractorParser",
@@ -226,6 +226,7 @@ class PluginCitizenshipRegistry:
         "swarmauri.vector_stores.Doc2VecEmbedding": "swarmauri_vectorstore_doc2vec.Doc2VecEmbedding",
         "swarmauri.tools.MatplotlibCsvTool": "swarmauri_tool_matplotlib.MatplotlibCsvTool",
         "swarmauri.tools.MatplotlibTool": "swarmauri_tool_matplotlib.MatplotlibTool",
+        "swarmauri.parsers.KeywordExtractorParser": "swarmauri_parser_keywordextractor.KeywordExtractorParser",
"swarmauri_parser_keywordextractor.KeywordExtractorParser", } SECOND_CLASS_REGISTRY: Dict[str, str] = {} THIRD_CLASS_REGISTRY: Dict[str, str] = {}