diff --git a/pkgs/standards/swarmauri_standard/pyproject.toml b/pkgs/standards/swarmauri_standard/pyproject.toml index a807e996..5e029b7b 100644 --- a/pkgs/standards/swarmauri_standard/pyproject.toml +++ b/pkgs/standards/swarmauri_standard/pyproject.toml @@ -45,17 +45,17 @@ aiohttp = { version = "^3.10.10", optional = true } #openai = { version = "^1.52.0", optional = true } #nltk = { version = "^3.9.1", optional = true } #textblob = { version = "^0.18.0", optional = true } -yake = { version = "==0.4.8", optional = true } +#yake = { version = "==0.4.8", optional = true } beautifulsoup4 = { version = "04.12.3", optional = true } #gensim = { version = "==4.3.3", optional = true } scipy = { version = ">=1.7.0,<1.14.0", optional = true } -scikit-learn = { version = "^1.4.2", optional = true } +#scikit-learn = { version = "^1.4.2", optional = true } #spacy = { version = ">=3.0.0,<=3.8.2", optional = true } #transformers = { version = "^4.45.0", optional = true } #torch = { version = "^2.5.0", optional = true } #keras = { version = ">=3.2.0", optional = true } #tf-keras = { version = ">=2.16.0", optional = true } -matplotlib = { version = ">=3.9.2", optional = true } +#matplotlib = { version = ">=3.9.2", optional = true } [tool.poetry.extras] # Extras without versioning, grouped for specific use cases @@ -64,16 +64,16 @@ io = ["aiofiles", "aiohttp"] nlp = [ #"nltk", #"textblob", - "yake" + #"yake" ] nlp_tools = ["beautifulsoup4"] #ml_toolkits = ["gensim", "scipy", "scikit-learn"] -ml_toolkits = ["scikit-learn"] +#ml_toolkits = ["scikit-learn"] #spacy = ["spacy"] #transformers = ["transformers"] #torch = ["torch"] #tensorflow = ["keras", "tf-keras"] -visualization = ["matplotlib"] +#visualization = ["matplotlib"] # Full option to install all extras full = [ @@ -82,7 +82,7 @@ full = [ #"nltk", "textblob", #"yake", "beautifulsoup4", - "scikit-learn", + ##"scikit-learn", #"gensim", "scipy", "scikit-learn", #"spacy", #"transformers", diff --git 
a/pkgs/standards/swarmauri_vectorstore_tfidf/README.md b/pkgs/standards/swarmauri_vectorstore_tfidf/README.md
new file mode 100644
index 00000000..24ded9c4
--- /dev/null
+++ b/pkgs/standards/swarmauri_vectorstore_tfidf/README.md
@@ -0,0 +1 @@
+# Swarmauri TF-IDF Vector Store
\ No newline at end of file
diff --git a/pkgs/standards/swarmauri_vectorstore_tfidf/pyproject.toml b/pkgs/standards/swarmauri_vectorstore_tfidf/pyproject.toml
new file mode 100644
index 00000000..645fbaa2
--- /dev/null
+++ b/pkgs/standards/swarmauri_vectorstore_tfidf/pyproject.toml
@@ -0,0 +1,62 @@
+[tool.poetry]
+name = "swarmauri_vectorstore_tfidf"
+version = "0.6.0.dev1"
+description = "TF-IDF embedding and vector store plugin for Swarmauri."
+authors = ["Jacob Stewart "]
+license = "Apache-2.0"
+readme = "README.md"
+repository = "http://github.com/swarmauri/swarmauri-sdk"
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12"
+]
+
+[tool.poetry.dependencies]
+python = ">=3.10,<3.13"
+
+# Swarmauri
+swarmauri_core = { path = "../../core" }
+swarmauri_base = { path = "../../base" }
+swarmauri_standard = { path = "../swarmauri_standard" }
+
+# Dependencies imported directly by the package (sklearn TfidfVectorizer, joblib save/load)
+scikit-learn = "^1.4.2"
+joblib = ">=1.2.0"
+
+[tool.poetry.group.dev.dependencies]
+flake8 = "^7.0"
+pytest = "^8.0"
+pytest-asyncio = ">=0.24.0"
+pytest-xdist = "^3.6.1"
+pytest-json-report = "^1.5.0"
+python-dotenv = "*"
+requests = "^2.32.3"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+norecursedirs = ["combined", "scripts"]
+
+markers = [
+    "test: standard test",
+    "unit: Unit tests",
+    "integration: Integration tests",
+    "acceptance: Acceptance tests",
+    "experimental: Experimental tests"
+]
+log_cli = true
+log_cli_level = "INFO"
+log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
+log_cli_date_format = "%Y-%m-%d %H:%M:%S"
+asyncio_default_fixture_loop_scope = "function"
+
+[tool.poetry.plugins."swarmauri.vector_stores"]
+TfidfVectorStore = "swarmauri_vectorstore_tfidf:TfidfVectorStore"
+
+[tool.poetry.plugins."swarmauri.embeddings"]
+TfidfEmbedding = "swarmauri_vectorstore_tfidf:TfidfEmbedding"
+
diff --git a/pkgs/standards/swarmauri_vectorstore_tfidf/swarmauri_vectorstore_tfidf/TfidfEmbedding.py b/pkgs/standards/swarmauri_vectorstore_tfidf/swarmauri_vectorstore_tfidf/TfidfEmbedding.py
new file mode 100644
index 00000000..4d55e134
--- /dev/null
+++ b/pkgs/standards/swarmauri_vectorstore_tfidf/swarmauri_vectorstore_tfidf/TfidfEmbedding.py
@@ -0,0 +1,68 @@
+from typing import List, Union, Any, Literal
+import joblib
+from pydantic import PrivateAttr
+from sklearn.feature_extraction.text import TfidfVectorizer as SklearnTfidfVectorizer
+
+from swarmauri_base.embeddings.EmbeddingBase import EmbeddingBase
+from swarmauri_standard.vectors.Vector import Vector
+from swarmauri_core.ComponentBase import ComponentBase
+
+
+@ComponentBase.register_type(EmbeddingBase, "TfidfEmbedding")
+class TfidfEmbedding(EmbeddingBase):
+    """TF-IDF embedding backed by scikit-learn's ``TfidfVectorizer``."""
+
+    _model = PrivateAttr()
+    # Sparse matrix from the most recent fit; ``None`` until the model is fitted.
+    _fit_matrix = PrivateAttr(default=None)
+    type: Literal["TfidfEmbedding"] = "TfidfEmbedding"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._model = SklearnTfidfVectorizer()
+
+    def extract_features(self):
+        """Return the feature names of the fitted vectorizer as a list."""
+        return self._model.get_feature_names_out().tolist()
+
+    def fit(self, documents: List[str]) -> None:
+        """Fit the vectorizer on ``documents`` and cache the resulting matrix."""
+        self._fit_matrix = self._model.fit_transform(documents)
+
+    def fit_transform(self, documents: List[str]) -> List[Vector]:
+        """Fit on ``documents`` and return one dense ``Vector`` per document."""
+        self._fit_matrix = self._model.fit_transform(documents)
+        # Convert the sparse matrix rows into Vector instances
+        vectors = [
+            Vector(value=vector.toarray().flatten()) for vector in self._fit_matrix
+        ]
+        return vectors
+
+    def transform(self, data: Union[str, Any], documents: List[str]) -> List[Vector]:
+        raise NotImplementedError("Transform not implemented on TFIDFVectorizer.")
+
+    def infer_vector(self, data: str, documents: List[str]) -> Vector:
+        """
+        Return the TF-IDF vector of ``data`` relative to ``documents``.
+
+        The model is re-fitted on ``documents`` plus ``data``; the caller's
+        list is left untouched.
+        """
+        # Build a new list rather than appending, so repeated calls do not
+        # grow the caller's corpus with previous queries.
+        tmp_tfidf_matrix = self.fit_transform([*documents, data])
+        return tmp_tfidf_matrix[-1]
+
+    def save_model(self, path: str) -> None:
+        """
+        Saves the TF-IDF model to the specified path using joblib.
+        """
+        joblib.dump(self._model, path)
+
+    def load_model(self, path: str) -> None:
+        """
+        Loads a TF-IDF model from the specified path using joblib.
+
+        joblib files are pickle-based: only load files from a trusted source.
+        """
+        self._model = joblib.load(path)
diff --git a/pkgs/standards/swarmauri_vectorstore_tfidf/swarmauri_vectorstore_tfidf/TfidfVectorStore.py b/pkgs/standards/swarmauri_vectorstore_tfidf/swarmauri_vectorstore_tfidf/TfidfVectorStore.py
new file mode 100644
index 00000000..8c306d36
--- /dev/null
+++ b/pkgs/standards/swarmauri_vectorstore_tfidf/swarmauri_vectorstore_tfidf/TfidfVectorStore.py
@@ -0,0 +1,95 @@
+from typing import List, Union, Literal
+from swarmauri_standard.documents.Document import Document
+from swarmauri_standard.distances.CosineDistance import CosineDistance
+
+from swarmauri_base.vector_stores.VectorStoreBase import VectorStoreBase
+from swarmauri_base.vector_stores.VectorStoreRetrieveMixin import (
+    VectorStoreRetrieveMixin,
+)
+from swarmauri_base.vector_stores.VectorStoreSaveLoadMixin import (
+    VectorStoreSaveLoadMixin,
+)
+from swarmauri_core.ComponentBase import ComponentBase
+
+from swarmauri_vectorstore_tfidf.TfidfEmbedding import TfidfEmbedding
+
+
+@ComponentBase.register_type(VectorStoreBase, "TfidfVectorStore")
+class TfidfVectorStore(
+    VectorStoreSaveLoadMixin, VectorStoreRetrieveMixin, VectorStoreBase
+):
+    """Vector store that keeps documents in a list and ranks them by TF-IDF cosine distance."""
+
+    type: Literal["TfidfVectorStore"] = "TfidfVectorStore"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._embedder = TfidfEmbedding()
+        self._distance = CosineDistance()
+        self.documents = []
+
+    def _refit(self) -> None:
+        """Re-fit the TF-IDF model on the current document contents."""
+        # scikit-learn raises ValueError when fitted on an empty corpus, so an
+        # empty store (e.g. after deleting the last document) skips the refit.
+        if self.documents:
+            self._embedder.fit([doc.content for doc in self.documents])
+
+    def add_document(self, document: Document) -> None:
+        """Append ``document`` and re-fit the model."""
+        self.documents.append(document)
+        self._refit()
+
+    def add_documents(self, documents: List[Document]) -> None:
+        """Append all ``documents`` and re-fit the model once."""
+        self.documents.extend(documents)
+        self._refit()
+
+    def get_document(self, id: str) -> Union[Document, None]:
+        """Return the first document whose ``id`` matches, or ``None``."""
+        for document in self.documents:
+            if document.id == id:
+                return document
+        return None
+
+    def get_all_documents(self) -> List[Document]:
+        """Return the list of stored documents."""
+        return self.documents
+
+    def delete_document(self, id: str) -> None:
+        """Remove every document whose ``id`` matches and re-fit the model."""
+        self.documents = [doc for doc in self.documents if doc.id != id]
+        self._refit()
+
+    def update_document(self, id: str, updated_document: Document) -> None:
+        """Replace the first document whose ``id`` matches and re-fit the model."""
+        for i, document in enumerate(self.documents):
+            if document.id == id:
+                self.documents[i] = updated_document
+                break
+
+        self._refit()
+
+    def retrieve(self, query: str, top_k: int = 5) -> List[Document]:
+        """
+        Return up to ``top_k`` documents closest to ``query``.
+
+        The model is re-fitted on the stored contents plus ``query``, so the
+        query is embedded with the same vocabulary as the documents.
+        """
+        # The query goes LAST so that row i of the matrix is self.documents[i].
+        corpus = [doc.content for doc in self.documents]
+        corpus.append(query)
+        transform_matrix = self._embedder.fit_transform(corpus)
+
+        # The inferred vector is the last vector in the transformed_matrix
+        # The rest of the matrix is what we are comparing
+        distances = self._distance.distances(
+            transform_matrix[-1], transform_matrix[:-1]
+        )
+
+        # Get the indices of the top_k most similar (least distant) documents
+        top_k_indices = sorted(range(len(distances)), key=lambda i: distances[i])[
+            :top_k
+        ]
+        return [self.documents[i] for i in top_k_indices]
diff --git a/pkgs/standards/swarmauri_vectorstore_tfidf/swarmauri_vectorstore_tfidf/__init__.py b/pkgs/standards/swarmauri_vectorstore_tfidf/swarmauri_vectorstore_tfidf/__init__.py
new file mode 100644
index 00000000..9d75f4fa
--- /dev/null
+++ b/pkgs/standards/swarmauri_vectorstore_tfidf/swarmauri_vectorstore_tfidf/__init__.py
@@ -0,0 +1,19 @@
+from .TfidfEmbedding import TfidfEmbedding
+from .TfidfVectorStore import TfidfVectorStore
+
+__all__ = ["TfidfEmbedding", "TfidfVectorStore"]
+
+# NOTE(review): pyproject.toml in this package declares 0.6.0.dev1 — confirm
+# which version is intended and keep the two in sync.
+__version__ = "0.6.0.dev26"
+__long_desc__ = """
+
+# Swarmauri Tfidf Plugin
+
+This repository includes a TF-IDF embedding and vector store plugin for Swarmauri.
+
+Visit us at: https://swarmauri.com
+Follow us at: https://github.com/swarmauri
+Star us at: https://github.com/swarmauri/swarmauri-sdk
+
+"""
diff --git a/pkgs/standards/swarmauri_vectorstore_tfidf/tests/unit/TfidfEmbedding_unit_test.py b/pkgs/standards/swarmauri_vectorstore_tfidf/tests/unit/TfidfEmbedding_unit_test.py
new file mode 100644
index 00000000..75850144
--- /dev/null
+++ b/pkgs/standards/swarmauri_vectorstore_tfidf/tests/unit/TfidfEmbedding_unit_test.py
@@ -0,0 +1,38 @@
+import pytest
+from swarmauri_vectorstore_tfidf.TfidfEmbedding import TfidfEmbedding
+
+
+@pytest.mark.unit
+def test_ubc_resource():
+    assert TfidfEmbedding().resource == "Embedding"
+
+
+@pytest.mark.unit
+def test_ubc_type():
+    assert TfidfEmbedding().type == "TfidfEmbedding"
+
+
+@pytest.mark.unit
+def test_serialization():
+    embedder = TfidfEmbedding()
+    assert (
+        embedder.id == TfidfEmbedding.model_validate_json(embedder.model_dump_json()).id
+    )
+
+
+@pytest.mark.unit
+def test_fit_transform():
+    embedder = TfidfEmbedding()
+    documents = ["test", "test1", "test2"]
+    embedder.fit_transform(documents)
+    assert documents == embedder.extract_features()
+
+
+@pytest.mark.unit
+def test_infer_vector():
+    embedder = TfidfEmbedding()
+    documents = ["test", "test1", "test2"]
+    embedder.fit_transform(documents)
+    assert embedder.infer_vector("hi", documents).value == [1.0, 0.0, 0.0, 0.0]
+    # infer_vector must not append the query to the caller's corpus
+    assert documents == ["test", "test1", "test2"]
diff --git a/pkgs/standards/swarmauri_vectorstore_tfidf/tests/unit/TfidfVectorStore_unit_test.py b/pkgs/standards/swarmauri_vectorstore_tfidf/tests/unit/TfidfVectorStore_unit_test.py
new file mode 100644
index 00000000..975df7d8
--- /dev/null
+++ b/pkgs/standards/swarmauri_vectorstore_tfidf/tests/unit/TfidfVectorStore_unit_test.py
@@ -0,0 +1,58 @@
+import pytest
+from swarmauri_standard.documents.Document import Document
+from swarmauri_vectorstore_tfidf.TfidfVectorStore import TfidfVectorStore
+
+
+@pytest.mark.unit
+def test_ubc_resource():
+    vs = TfidfVectorStore()
+    assert vs.resource == "VectorStore"
+    assert vs.embedder.resource == "Embedding"
+
+
+@pytest.mark.unit
+def test_ubc_type():
+    vs = TfidfVectorStore()
+    assert vs.type == "TfidfVectorStore"
+
+
+@pytest.mark.unit
+def test_serialization():
+    vs = TfidfVectorStore()
+    assert vs.id == TfidfVectorStore.model_validate_json(vs.model_dump_json()).id
+
+
+@pytest.mark.unit
+def test_top_k():
+    vs = TfidfVectorStore()
+    documents = [
+        Document(content="test"),
+        Document(content="test1"),
+        Document(content="test2"),
+        Document(content="test3"),
+    ]
+
+    vs.add_documents(documents)
+    assert len(vs.retrieve(query="test", top_k=2)) == 2
+
+
+@pytest.mark.unit
+def test_retrieve_ranks_closest_document_first():
+    vs = TfidfVectorStore()
+    vs.add_documents(
+        [
+            Document(content="apple banana"),
+            Document(content="cat dog"),
+            Document(content="red green blue"),
+        ]
+    )
+    assert vs.retrieve(query="cat", top_k=1)[0].content == "cat dog"
+
+
+@pytest.mark.unit
+def test_delete_last_document():
+    vs = TfidfVectorStore()
+    document = Document(content="test")
+    vs.add_document(document)
+    vs.delete_document(document.id)
+    assert vs.get_all_documents() == []
diff --git a/pkgs/swarmauri/pyproject.toml b/pkgs/swarmauri/pyproject.toml
index 9ee80e5e..f5e5f6a1 100644
--- a/pkgs/swarmauri/pyproject.toml
+++ b/pkgs/swarmauri/pyproject.toml
@@ -49,9 +49,10 @@ full = [
     "scikit-learn",
     "matplotlib"
 ]
-doc2vecvectorstore = ["swarmauri_doc2vec_vectorstore"]
+doc2vecvectorstore = ["swarmauri_vectorstore_doc2vec"]
 matplotlib_tool = ["swarmauri_tool_matplotlib"]
 keywordextractor_parser = ["swarmauri_parser_keywordextractor"]
+tfidf_vectorstore = ["swarmauri_vectorstore_tfidf"]
 
 [tool.setuptools]
 namespace_packages = ["swarmauri"]
diff --git a/pkgs/swarmauri/swarmauri/plugin_citizenship_registry.py b/pkgs/swarmauri/swarmauri/plugin_citizenship_registry.py
index a75ed24c..a857f4c0 100644
--- a/pkgs/swarmauri/swarmauri/plugin_citizenship_registry.py
+++ b/pkgs/swarmauri/swarmauri/plugin_citizenship_registry.py
@@ -83,7 +83,7 @@ class PluginCitizenshipRegistry:
         "swarmauri.embeddings.MistralEmbedding": "swarmauri_standard.embeddings.MistralEmbedding",
         "swarmauri.embeddings.NmfEmbedding": "swarmauri_standard.embeddings.NmfEmbedding",
         "swarmauri.embeddings.OpenAIEmbedding": "swarmauri_standard.embeddings.OpenAIEmbedding",
-        "swarmauri.embeddings.TfidfEmbedding": "swarmauri_standard.embeddings.TfidfEmbedding",
+        # "swarmauri.embeddings.TfidfEmbedding": "swarmauri_standard.embeddings.TfidfEmbedding",
         "swarmauri.embeddings.VoyageEmbedding": "swarmauri_standard.embeddings.VoyageEmbedding",
         "swarmauri.exceptions.IndexErrorWithContext": "swarmauri_standard.exceptions.IndexErrorWithContext",
         "swarmauri.factories.AgentFactory": "swarmauri_standard.factories.AgentFactory",
@@ -219,14 +219,17 @@ class PluginCitizenshipRegistry:
         "swarmauri.utils.sql_log": "swarmauri_standard.utils.sql_log",
         "swarmauri.utils.timeout_wrapper": "swarmauri_standard.utils.timeout_wrapper",
         "swarmauri.vector_stores.SqliteVectorStore": "swarmauri_standard.vector_stores.SqliteVectorStore",
-        "swarmauri.vector_stores.TfidfVectorStore": "swarmauri_standard.vector_stores.TfidfVectorStore",
+        # "swarmauri.vector_stores.TfidfVectorStore": "swarmauri_standard.vector_stores.TfidfVectorStore",
         "swarmauri.vectors.Vector": "swarmauri_standard.vectors.Vector",
         # extra
         "swarmauri.vector_stores.Doc2vecVectorStore": "swarmauri_vectorstore_doc2vec.Doc2vecVectorStore",
-        "swarmauri.vector_stores.Doc2VecEmbedding": "swarmauri_vectorstore_doc2vec.Doc2VecEmbedding",
+        "swarmauri.embeddings.Doc2VecEmbedding": "swarmauri_vectorstore_doc2vec.Doc2VecEmbedding",
         "swarmauri.tools.MatplotlibCsvTool": "swarmauri_tool_matplotlib.MatplotlibCsvTool",
         "swarmauri.tools.MatplotlibTool": "swarmauri_tool_matplotlib.MatplotlibTool",
         "swarmauri.parsers.KeywordExtractorParser": "swarmauri_parser_keywordextractor.KeywordExtractorParser",
+        "swarmauri.vector_stores.TfidfVectorStore": "swarmauri_vectorstore_tfidf.TfidfVectorStore",
+        "swarmauri.embeddings.TfidfEmbedding": "swarmauri_vectorstore_tfidf.TfidfEmbedding",
+
     }
     SECOND_CLASS_REGISTRY: Dict[str, str] = {}
     THIRD_CLASS_REGISTRY: Dict[str, str] = {}