Merge pull request #1057 from MichaelDecent/pkg2

Add Swarmauri Keyword Extractor Plugin
swarmauri · Jan 9, 2025 · ce48530 · ce48530
2 parents a51d05f + 5510ea5
commit ce48530
Show file tree

Hide file tree

Showing 8 changed files with 155 additions and 2 deletions.
diff --git a/pkgs/standards/swarmauri_parser_keywordextractor/README.md b/pkgs/standards/swarmauri_parser_keywordextractor/README.md
@@ -0,0 +1 @@
+# Swarmauri Keyword Extractor Plugin
diff --git a/pkgs/standards/swarmauri_parser_keywordextractor/pyproject.toml b/pkgs/standards/swarmauri_parser_keywordextractor/pyproject.toml
@@ -0,0 +1,53 @@
+[tool.poetry]
+name = "swarmauri_parser_keywordextractor"
+version = "0.6.0.dev1"
+description = "Swarmauri parser plugin that extracts keywords from text using the YAKE library."
+authors = ["Jacob Stewart <[email protected]>"]
+license = "Apache-2.0"
+readme = "README.md"
+repository = "http://github.com/swarmauri/swarmauri-sdk"
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12"
+]
+
+[tool.poetry.dependencies]
+python = ">=3.10,<3.13"
+
+# Swarmauri
+swarmauri_core = { path = "../../core" }
+swarmauri_base = { path = "../../base" }
+swarmauri_standard = { path = "../swarmauri_standard" }  # provides Document, imported by KeywordExtractorParser.py
+
+# Third-party
+yake = "^0.4.8"  # imported by KeywordExtractorParser.py; NOTE(review): confirm the version pin
+
+[tool.poetry.group.dev.dependencies]
+flake8 = "^7.0"
+pytest = "^8.0"
+pytest-asyncio = ">=0.24.0"
+pytest-xdist = "^3.6.1"
+pytest-json-report = "^1.5.0"
+python-dotenv = "*"
+requests = "^2.32.3"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+norecursedirs = ["combined", "scripts"]
+
+markers = [
+    "test: standard test",
+    "unit: Unit tests",
+    "integration: Integration tests",
+    "acceptance: Acceptance tests",
+    "experimental: Experimental tests"
+]
+log_cli = true
+log_cli_level = "INFO"
+log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
+log_cli_date_format = "%Y-%m-%d %H:%M:%S"
+asyncio_default_fixture_loop_scope = "function"
+
+[tool.poetry.plugins."swarmauri.parsers"]
+KeywordExtractorParser = "swarmauri_parser_keywordextractor:KeywordExtractorParser"
diff --git a/...mauri_parser_keywordextractor/swarmauri_parser_keywordextractor/KeywordExtractorParser.py b/...mauri_parser_keywordextractor/swarmauri_parser_keywordextractor/KeywordExtractorParser.py
@@ -0,0 +1,55 @@
+import yake
+from typing import List, Union, Any, Literal
+from pydantic import ConfigDict, PrivateAttr
+from swarmauri_standard.documents.Document import Document
+from swarmauri_base.parsers.ParserBase import ParserBase
+from swarmauri_core.ComponentBase import ComponentBase
+
+
+@ComponentBase.register_type(ParserBase, "KeywordExtractorParser")
+class KeywordExtractorParser(ParserBase):
+    """
+    Extracts keywords from text using the YAKE keyword extraction library.
+    """
+
+    lang: str = "en"
+    num_keywords: int = 10
+    _kw_extractor: yake.KeywordExtractor = PrivateAttr(default=None)
+    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
+    type: Literal["KeywordExtractorParser"] = "KeywordExtractorParser"
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        self._kw_extractor = yake.KeywordExtractor(
+            lan=self.lang,
+            n=3,
+            dedupLim=0.9,
+            dedupFunc="seqm",
+            windowsSize=1,
+            top=self.num_keywords,
+            features=None,
+        )
+
+    def parse(self, data: Union[str, Any]) -> List[Document]:
+        """
+        Extract keywords from input text and return as list of Document instances containing keyword information.
+
+        Parameters:
+        - data (Union[str, Any]): The input text from which to extract keywords.
+
+        Returns:
+        - List[Document]: A list of Document instances, each containing information about an extracted keyword.
+        """
+        # Ensure data is in string format for analysis
+        text = str(data) if not isinstance(data, str) else data
+
+        # Extract keywords using YAKE
+        keywords = self._kw_extractor.extract_keywords(text)
+
+        # Create Document instances for each keyword
+        documents = [
+            Document(content=keyword, metadata={"score": score})
+            for index, (keyword, score) in enumerate(keywords)
+        ]
+
+        return documents
diff --git a/...standards/swarmauri_parser_keywordextractor/swarmauri_parser_keywordextractor/__init__.py b/...standards/swarmauri_parser_keywordextractor/swarmauri_parser_keywordextractor/__init__.py
@@ -0,0 +1,14 @@
+from .KeywordExtractorParser import KeywordExtractorParser
+
+__version__ = "0.6.0.dev26"
+__long_desc__ = """
+
+# Swarmauri Keyword Extractor Plugin
+
+This repository includes an Keyword Extractor of a Swarmauri Plugin.
+
+Visit us at: https://swarmauri.com
+Follow us at: https://github.com/swarmauri
+Star us at: https://github.com/swarmauri/swarmauri-sdk
+
+"""
diff --git a/...tandards/swarmauri_parser_keywordextractor/tests/unit/KeywordExtractorParser_unit_test.py b/...tandards/swarmauri_parser_keywordextractor/tests/unit/KeywordExtractorParser_unit_test.py
@@ -0,0 +1,28 @@
+import pytest
+from swarmauri_parser_keywordextractor.KeywordExtractorParser import (
+    KeywordExtractorParser as Parser,
+)
+
+
+@pytest.mark.unit
+def test_ubc_resource():
+    parser = Parser()
+    assert parser.resource == "Parser"
+
+
+@pytest.mark.unit
+def test_ubc_type():
+    parser = Parser()
+    assert parser.type == "KeywordExtractorParser"
+
+
+@pytest.mark.unit
+def test_serialization():
+    parser = Parser()
+    assert parser.id == Parser.model_validate_json(parser.model_dump_json()).id
+
+
+@pytest.mark.unit
+def test_parse():
+    assert Parser().parse("test two burgers")[2].resource == "Document"
+    assert Parser().parse("test two burgers")[2].content == "burgers"
diff --git a/pkgs/standards/swarmauri_standard/pyproject.toml b/pkgs/standards/swarmauri_standard/pyproject.toml
@@ -80,7 +80,7 @@ full = [
     "aiofiles", "aiohttp",
     #"cohere", "mistralai", "fal-client", "google-generativeai", "openai",
     #"nltk", "textblob", 
-    "yake",
+    #"yake",
     "beautifulsoup4",
     "scikit-learn",
     #"gensim", "scipy", "scikit-learn",

diff --git a/pkgs/swarmauri/pyproject.toml b/pkgs/swarmauri/pyproject.toml
@@ -51,6 +51,7 @@ full = [
 ]
 doc2vecvectorstore = ["swarmauri_doc2vec_vectorstore"]
 matplotlib_tool = ["swarmauri_tool_matplotlib"]
+keywordextractor_parser = ["swarmauri_parser_keywordextractor"]
 
 [tool.setuptools]
 namespace_packages = ["swarmauri"]

diff --git a/pkgs/swarmauri/swarmauri/plugin_citizenship_registry.py b/pkgs/swarmauri/swarmauri/plugin_citizenship_registry.py
@@ -137,7 +137,7 @@ class PluginCitizenshipRegistry:
         "swarmauri.parsers.BeautifulSoupElementParser": "swarmauri_standard.parsers.BeautifulSoupElementParser",
         "swarmauri.parsers.CSVParser": "swarmauri_standard.parsers.CSVParser",
         "swarmauri.parsers.HTMLTagStripParser": "swarmauri_standard.parsers.HTMLTagStripParser",
-        "swarmauri.parsers.KeywordExtractorParser": "swarmauri_standard.parsers.KeywordExtractorParser",
+        # "swarmauri.parsers.KeywordExtractorParser": "swarmauri_standard.parsers.KeywordExtractorParser",
         "swarmauri.parsers.Md2HtmlParser": "swarmauri_standard.parsers.Md2HtmlParser",
         "swarmauri.parsers.OpenAPISpecParser": "swarmauri_standard.parsers.OpenAPISpecParser",
         "swarmauri.parsers.PhoneNumberExtractorParser": "swarmauri_standard.parsers.PhoneNumberExtractorParser",
@@ -226,6 +226,7 @@ class PluginCitizenshipRegistry:
         "swarmauri.vector_stores.Doc2VecEmbedding": "swarmauri_vectorstore_doc2vec.Doc2VecEmbedding",
         "swarmauri.tools.MatplotlibCsvTool": "swarmauri_tool_matplotlib.MatplotlibCsvTool",
         "swarmauri.tools.MatplotlibTool": "swarmauri_tool_matplotlib.MatplotlibTool",
+        "swarmauri.parsers.KeywordExtractorParser": "swarmauri_parser_keywordextractor.KeywordExtractorParser",
     }
     SECOND_CLASS_REGISTRY: Dict[str, str] = {}
     THIRD_CLASS_REGISTRY: Dict[str, str] = {}