Commit
Feat: add weaviate database (#24)
* Implemented vector_store/weaviate
jirispilka authored Jul 8, 2024
1 parent 4d070b5 commit 6335457
Showing 72 changed files with 119,107 additions and 1,039,144 deletions.
1 change: 1 addition & 0 deletions Makefile
@@ -26,6 +26,7 @@ pydantic-model:
datamodel-codegen --input $(DIRS_WITH_ACTORS)/pgvector/.actor/input_schema.json --output $(DIRS_WITH_CODE)/src/models/pgvector_input_model.py --input-file-type jsonschema --field-constraints
datamodel-codegen --input $(DIRS_WITH_ACTORS)/pinecone/.actor/input_schema.json --output $(DIRS_WITH_CODE)/src/models/pinecone_input_model.py --input-file-type jsonschema --field-constraints
datamodel-codegen --input $(DIRS_WITH_ACTORS)/qdrant/.actor/input_schema.json --output $(DIRS_WITH_CODE)/src/models/qdrant_input_model.py --input-file-type jsonschema --field-constraints
datamodel-codegen --input $(DIRS_WITH_ACTORS)/weaviate/.actor/input_schema.json --output $(DIRS_WITH_CODE)/src/models/weaviate_input_model.py --input-file-type jsonschema --field-constraints

pytest:
poetry run -C $(DIRS_WITH_CODE) pytest --with-integration --vcr-record=none
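
The generated pydantic model can then be imported from `src/models` and used to validate raw Actor input. Below is a minimal sketch of that usage, assuming the generated class is named `WeaviateIntegration` (by analogy with the `PgvectorIntegration` import shown in the README below); the URL and keys are placeholders.

```python
# Hypothetical usage of the model generated by datamodel-codegen above.
# The class name WeaviateIntegration is an assumption based on the schema title.
from src.models.weaviate_input_model import WeaviateIntegration

actor_input = {
    "weaviateUrl": "https://my-cluster.weaviate.network",  # placeholder
    "weaviateApiKey": "YOUR-WEAVIATE-API-KEY",
    "weaviateCollectionName": "Apify",
    "embeddingsProvider": "OpenAI",
    "embeddingsApiKey": "YOUR-OPENAI-API-KEY",
    "datasetFields": ["text"],
}

# Pydantic validates the raw input against the constraints from input_schema.json.
config = WeaviateIntegration(**actor_input)
print(config.weaviateCollectionName)
```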
8 changes: 6 additions & 2 deletions README.md
@@ -66,7 +66,11 @@ services:
```bash
make pydantic-model
```
1. Create a new vector store in the `vector_stores` directory, e.g. `vector_stores/pgvector` and implement all functions
1. Import the created model in `src/models/__init__.py`:
```python
from .pgvector_input_model import PgvectorIntegration
```
1. Create a new module (`pgvector.py`) in the `vector_stores` directory, e.g. `vector_stores/pgvector`, and implement the `PGVectorDatabase` class with all required methods (a rough sketch follows this file's diff).
1. Add PGVector to `SupportedVectorStores` in `constants.py`
```python
class SupportedVectorStores(str, enum.Enum):
@@ -93,7 +97,7 @@
return PGVectorDatabase(actor_input, embeddings)
```

1. Add `PGVectorDatabase` fixture into `tests/confets.py`
1. Add a `PGVectorDatabase` fixture to `tests/conftest.py`
```python
@pytest.fixture()
def db_pgvector(crawl_1: list[Document]) -> PGVectorDatabase:
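
To make the contribution steps above concrete, here is a rough, hypothetical sketch of a new vector store module. Only the constructor signature is taken from the factory call shown above (`PGVectorDatabase(actor_input, embeddings)`); the method names and behaviour are illustrative placeholders, since the required interface is defined elsewhere in the repository.

```python
# vector_stores/pgvector.py — illustrative skeleton, not the actual implementation
from __future__ import annotations

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings


class PGVectorDatabase:
    """Sketch of a vector store wrapper; the real required methods may differ."""

    def __init__(self, actor_input, embeddings: Embeddings) -> None:
        self.actor_input = actor_input
        self.embeddings = embeddings

    def add_documents(self, documents: list[Document]) -> None:
        # Hypothetical: embed the documents and insert them into the store.
        raise NotImplementedError

    def delete(self, ids: list[str]) -> None:
        # Hypothetical: remove objects by their ids (used e.g. for delta updates).
        raise NotImplementedError
```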
2 changes: 0 additions & 2 deletions actors/chroma/README.md
@@ -9,8 +9,6 @@ This approach reduces unnecessary embedding computation and storage operations,
💡 **Note**: This Actor is meant to be used together with other Actors' integration sections.
For instance, if you are using the [Website Content Crawler](https://apify.com/apify/website-content-crawler), you can activate Chroma integration to save web data as vectors to Chroma.

For more information how to leverage vector stores in Apify platform, see [Pinecone integration](https://github.com/apify/pinecone) and detailed blog post [what Pinecone is and why you should use it with your LLMs](https://blog.apify.com/what-is-pinecone-why-use-it-with-llms/).

## How does it work?

The Apify Chroma integration computes text embeddings and stores them in Chroma.
6 changes: 3 additions & 3 deletions actors/pinecone/README.md
@@ -9,7 +9,7 @@ This approach reduces unnecessary embedding computation and storage operations,
💡 **Note**: This Actor is meant to be used together with other Actors' integration sections.
For instance, if you are using the [Website Content Crawler](https://apify.com/apify/website-content-crawler), you can activate Pinecone integration to save web data as vectors to Pinecone.

For more information how to leverage vector stores in Apify platform, see [Pinecone integration](https://github.com/HonzaTuron/pinecone) and detailed blog post [what Pinecone is and why you should use it with your LLMs](https://blog.apify.com/what-is-pinecone-why-use-it-with-llms/).
For more information on how to leverage vector stores on the Apify platform, see the detailed blog post [What Pinecone is and why you should use it with your LLMs](https://blog.apify.com/what-is-pinecone-why-use-it-with-llms/).

## How does it work?

@@ -44,7 +44,7 @@ This means your Pinecone index should also be configured to accommodate vectors
```json
{
"pineconeApiKey": "YOUR-PINECONE-API-KEY",
"pineconeIndexName": "apify-pinecone"
"pineconeIndexName": "apify"
}
```

@@ -176,7 +176,7 @@ This integration will save the selected fields from your Actor to Pinecone.
```json
{
"pineconeApiKey": "YOUR-PINECONE-API-KEY",
"pineconeIndexName": "apify-pinecone"
"pineconeIndexName": "apify"
}
```

15 changes: 15 additions & 0 deletions actors/weaviate/.actor/actor.json
@@ -0,0 +1,15 @@
{
"actorSpecification": 1,
"name": "weaviate-integration",
"title": "Weaviate Integration",
"description": "Upload a dataset to Weaviate",
"version": "0.0",
"input": "./input_schema.json",
"dockerfile": "../../../shared/Dockerfile",
"readme": "./README.md",
"changelog":"../../../shared/CHANGELOG.md",
"storages": {
"dataset": "../../../shared/dataset_schema.json"
},
"dockerContextDir": "../../.."
}
143 changes: 143 additions & 0 deletions actors/weaviate/.actor/input_schema.json
@@ -0,0 +1,143 @@
{
"title": "Weaviate-integration",
"type": "object",
"schemaVersion": 1,
"properties": {
"weaviateUrl": {
"title": "Weaviate URL",
"type": "string",
"description": "REST URL of the Weaviate instance to connect to",
"editor": "textfield",
"sectionCaption": "Weaviate settings"
},
"weaviateApiKey": {
"title": "Weaviate API KEY",
"description": "Weaviate API KEY",
"type": "string",
"editor": "textfield",
"isSecret": true
},
"weaviateCollectionName": {
"title": "Weaviate collection name",
"type": "string",
"description": "Name of the Weaviate collection where the data will be stored",
"editor": "textfield"
},
"embeddingsProvider": {
"title": "Embeddings provider (as defined in the langchain API)",
"description": "Choose the embeddings provider to use for generating embeddings",
"type": "string",
"editor": "select",
"enum": [
"OpenAI",
"Cohere"
],
"default": "OpenAI",
"sectionCaption": "Embeddings settings"
},
"embeddingsConfig": {
"title": "Configuration for embeddings provider",
"description": "Configure the parameters for the LangChain embedding class. Key points to consider:\n\n1. Typically, you only need to specify the model name. For example, for OpenAI, set the model name as {\"model\": \"text-embedding-3-small\"}.\n\n2. It's crucial to ensure that the vector size of your embeddings matches the size of embeddings in the database.\n\n3. Here are some examples of embedding models:\n - [OpenAI](https://platform.openai.com/docs/guides/embeddings): `text-embedding-3-small`, `text-embedding-3-large`, etc.\n - [Cohere](https://docs.cohere.com/docs/cohere-embed): `embed-english-v3.0`, `embed-multilingual-light-v3.0`, etc.\n\n4. For more details about other parameters, refer to the [LangChain documentation](https://python.langchain.com/v0.2/docs/integrations/text_embedding/).",
"type": "object",
"editor": "json"
},
"embeddingsApiKey": {
"title": "Embeddings API KEY (whenever applicable, depends on provider)",
"description": "Value of the API KEY for the embeddings provider (if required).\n\n For example for OpenAI it is OPENAI_API_KEY, for Cohere it is COHERE_API_KEY)",
"type": "string",
"editor": "textfield",
"isSecret": true
},
"datasetFields": {
"title": "Dataset fields to select from the dataset results and store in the database",
"type": "array",
"description": "This array specifies the dataset fields to be selected and stored in the vector store. Only the fields listed here will be included in the vector store.\n\nFor instance, when using the Website Content Crawler, you might choose to include fields such as `text`, `url`, and `metadata.title` in the vector store.",
"default": [
"text"
],
"prefill": [
"text"
],
"editor": "stringList",
"sectionCaption": "Dataset settings"
},
"metadataDatasetFields": {
"title": "Dataset fields to select from the dataset and store as metadata in the database",
"type": "object",
"description": "A list of dataset fields which should be selected from the dataset and stored as metadata in the vector stores.\n\nFor example, when using the Website Content Crawler, you might want to store `url` in metadata. In this case, use `metadataDatasetFields parameter as follows {\"url\": \"url\"}`",
"editor": "json"
},
"metadataObject": {
"title": "Custom object to be stored as metadata in the vector store database",
"type": "object",
"description": "This object allows you to store custom metadata for every item in the vector store.\n\nFor example, if you want to store the `domain` as metadata, use the `metadataObject` like this: {\"domain\": \"apify.com\"}.",
"editor": "json"
},
"datasetId": {
"title": "Dataset ID",
"type": "string",
"description": "Dataset ID (when running standalone without integration)",
"editor": "textfield"
},
"enableDeltaUpdates": {
"title": "Enable incremental updates for objects based on deltas",
"type": "boolean",
"description": "When set to true, this setting enables incremental updates for objects in the database by comparing the changes (deltas) between the crawled dataset items and the existing objects, uniquely identified by the `datasetKeysToItemId` field.\n\n The integration will only add new objects and update those that have changed, reducing unnecessary updates. The `datasetFields`, `metadataDatasetFields`, and `metadataObject` fields are used to determine the changes.",
"default": true,
"sectionCaption": "Delta updates settings"
},
"deltaUpdatesPrimaryDatasetFields": {
"title": "Dataset fields to uniquely identify dataset items (only relevant when `enableDeltaUpdates` is enabled)",
"type": "array",
"items": {
"type": "string"
},
"description": "This array contains fields that are used to uniquely identify dataset items, which helps to handle content changes across different runs.\n\nFor instance, in a web content crawling scenario, the `url` field could serve as a unique identifier for each item.",
"editor": "stringList",
"default": [
"url"
],
"prefill": [
"url"
]
},
"expiredObjectDeletionPeriodDays": {
"title": "Delete expired objects from the database after a specified number of days (only relevant when `enableDeltaUpdates` is enabled)",
"type": "integer",
"description": "This setting allows the integration to manage the deletion of objects from the database that have not been crawled for a specified period. It is typically used in subsequent runs after the initial crawl.\n\nWhen the value is greater than 0, the integration checks if objects have been seen within the last X days (determined by the expiration period). If the objects are expired, they are deleted from the database. The specific value for `deletedExpiredObjectsDays` depends on your use case and how frequently you crawl data.\n\nFor example, if you crawl data daily, you can set `deletedExpiredObjectsDays` to 7 days. If you crawl data weekly, you can set `deletedExpiredObjectsDays` to 30 days.\n\nSetting `deletedExpiredObjectsDays` to 0 disables this feature",
"default": 30,
"minimum": 0,
"unit": "days",
"editor": "number"
},
"performChunking": {
"title": "Enable text chunking",
"description": "When set to true, the text will be divided into smaller chunks based on the settings provided below. Proper chunking helps optimize retrieval and ensures accurate and efficient responses.",
"default": false,
"type": "boolean",
"sectionCaption": "Text chunking settings"
},
"chunkSize": {
"title": "Maximum chunk size",
"type": "integer",
"description": "Defines the maximum number of characters in each text chunk. Choosing the right size balances between detailed context and system performance. Optimal sizes ensure high relevancy and minimal response time.",
"default": 1000,
"minimum": 1
},
"chunkOverlap": {
"title": "Chunk overlap",
"type": "integer",
"description": "Specifies the number of overlapping characters between consecutive text chunks. Adjusting this helps maintain context across chunks, which is crucial for accuracy in retrieval-augmented generation systems.",
"default": 0,
"minimum": 0
}
},
"required": [
"weaviateUrl",
"weaviateApiKey",
"weaviateCollectionName",
"embeddingsProvider",
"embeddingsApiKey",
"datasetFields"
]
}
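
To illustrate how the embeddings and chunking settings defined in this schema are typically consumed, here is a rough LangChain-based sketch (the schema itself points to the LangChain embeddings documentation). The specific classes and package names below are assumptions for illustration; the integration's actual code may wire these differently.

```python
# Illustrative mapping of input_schema.json settings onto LangChain objects.
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# embeddingsProvider = "OpenAI", embeddingsConfig = {"model": "text-embedding-3-small"}
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", api_key="YOUR-OPENAI-API-KEY")

# performChunking = true, chunkSize = 1000, chunkOverlap = 0 (sizes are in characters)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
chunks = splitter.split_text("Long text scraped by the Website Content Crawler ...")

# Each chunk would then be embedded and stored in the Weaviate collection.
vectors = embeddings.embed_documents(chunks)
```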