diff --git a/graphrag/api/query.py b/graphrag/api/query.py index ae1438269b..dc11ee33c2 100644 --- a/graphrag/api/query.py +++ b/graphrag/api/query.py @@ -26,6 +26,7 @@ from graphrag.config.embeddings import ( community_full_content_embedding, + create_collection_name, entity_description_embedding, text_unit_text_embedding, ) @@ -47,7 +48,6 @@ read_indexer_text_units, ) from graphrag.utils.cli import redact -from graphrag.utils.embeddings import create_collection_name from graphrag.vector_stores.base import BaseVectorStore from graphrag.vector_stores.factory import VectorStoreFactory diff --git a/graphrag/config/embeddings.py b/graphrag/config/embeddings.py index e63239ed6c..78f3cfdfe2 100644 --- a/graphrag/config/embeddings.py +++ b/graphrag/config/embeddings.py @@ -75,3 +75,22 @@ def get_embedding_settings( return { "strategy": strategy, } + + +def create_collection_name( + container_name: str, embedding_name: str, validate: bool = True +) -> str: + """ + Create a collection name for the embedding store. + + Within any given vector store, we can have multiple sets of embeddings organized into projects. + The `container_name` param is used for this partitioning, and is added as a prefix to the collection name for differentiation. + + The embedding name is fixed, with the available list defined in graphrag.config.embeddings + + Note that we use dot notation in our names, but many vector stores do not support this - so we convert to dashes. 
+ """ + if validate and embedding_name not in all_embeddings: + msg = f"Invalid embedding name: {embedding_name}" + raise KeyError(msg) + return f"{container_name}-{embedding_name}".replace(".", "-") diff --git a/graphrag/index/operations/embed_text/embed_text.py b/graphrag/index/operations/embed_text/embed_text.py index 79110c076d..c735d1024a 100644 --- a/graphrag/index/operations/embed_text/embed_text.py +++ b/graphrag/index/operations/embed_text/embed_text.py @@ -12,8 +12,8 @@ from graphrag.cache.pipeline_cache import PipelineCache from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks +from graphrag.config.embeddings import create_collection_name from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingStrategy -from graphrag.utils.embeddings import create_collection_name from graphrag.vector_stores.base import BaseVectorStore, VectorStoreDocument from graphrag.vector_stores.factory import VectorStoreFactory diff --git a/graphrag/utils/embeddings.py b/graphrag/utils/embeddings.py deleted file mode 100644 index ff9fb1cd3a..0000000000 --- a/graphrag/utils/embeddings.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Utilities for working with embeddings stores.""" - -from graphrag.config.embeddings import all_embeddings - - -def create_collection_name( - container_name: str, embedding_name: str, validate: bool = True -) -> str: - """ - Create a collection name for the embedding store. - - Within any given vector store, we can have multiple sets of embeddings organized into projects. - The `container` param is used for this partitioning, and is added as a prefix to the collection name for differentiation. - - The embedding name is fixed, with the available list defined in graphrag.index.config.embeddings - - Note that we use dot notation in our names, but many vector stores do not support this - so we convert to dashes. 
- """ - if validate and embedding_name not in all_embeddings: - msg = f"Invalid embedding name: {embedding_name}" - raise KeyError(msg) - return f"{container_name}-{embedding_name}".replace(".", "-") diff --git a/tests/unit/utils/test_embeddings.py b/tests/unit/utils/test_embeddings.py index 54a6b79647..854bf08180 100644 --- a/tests/unit/utils/test_embeddings.py +++ b/tests/unit/utils/test_embeddings.py @@ -3,7 +3,7 @@ import pytest -from graphrag.utils.embeddings import create_collection_name +from graphrag.config.embeddings import create_collection_name def test_create_collection_name():