This repository has been archived by the owner on Mar 1, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 737
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add multi doc autoretrieval pack (#803)
* cr * cr * cr * cr
- Loading branch information
Showing
9 changed files
with
672 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
# Multi-Document AutoRetrieval (with Weaviate) Pack | ||
|
||
This LlamaPack implements structured hierarchical retrieval over multiple documents, using multiple Weaviate collections.
|
||
## CLI Usage | ||
|
||
You can download llamapacks directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: | ||
|
||
```bash | ||
llamaindex-cli download-llamapack MultiDocAutoRetrieverPack --download-dir ./multidoc_autoretrieval_pack | ||
``` | ||
|
||
You can then inspect the files at `./multidoc_autoretrieval_pack` and use them as a template for your own project! | ||
|
||
## Code Usage | ||
|
||
You can download the pack to the `./multidoc_autoretrieval_pack` directory:
|
||
```python | ||
from llama_index.llama_pack import download_llama_pack | ||
|
||
# download and install dependencies | ||
MultiDocAutoRetrieverPack = download_llama_pack( | ||
"MultiDocAutoRetrieverPack", "./multidoc_autoretrieval_pack" | ||
) | ||
``` | ||
|
||
From here, you can use the pack. To initialize it, you need to define a few arguments, see below. | ||
|
||
Then, you can set up the pack like so: | ||
|
||
```python | ||
# setup pack arguments | ||
from llama_index.vector_stores.types import MetadataInfo, VectorStoreInfo | ||
|
||
import weaviate | ||
|
||
# cloud | ||
auth_config = weaviate.AuthApiKey(api_key="<api_key>") | ||
client = weaviate.Client( | ||
"https://<cluster>.weaviate.network", | ||
auth_client_secret=auth_config, | ||
) | ||
|
||
vector_store_info = VectorStoreInfo( | ||
content_info="Github Issues", | ||
metadata_info=[ | ||
MetadataInfo( | ||
name="state", | ||
description="Whether the issue is `open` or `closed`", | ||
type="string", | ||
), | ||
... | ||
] | ||
) | ||
|
||
# metadata_nodes is set of nodes with metadata representing each document | ||
# docs is the source docs | ||
# metadata_nodes and docs must be the same length | ||
metadata_nodes = [TextNode(..., metadata={...}), ...] | ||
docs = [Document(...), ...] | ||
|
||
pack = MultiDocAutoRetrieverPack( | ||
client, | ||
"<metadata_index_name>", | ||
"<doc_chunks_index_name>", | ||
metadata_nodes, | ||
docs, | ||
vector_store_info, | ||
auto_retriever_kwargs={ | ||
# any kwargs for the auto-retriever | ||
... | ||
} | ||
) | ||
|
||
``` | ||
|
||
The `run()` function is a light wrapper around `query_engine.query()`. | ||
|
||
```python | ||
response = pack.run("Tell me about a music celebrity.")
``` | ||
|
||
You can also use modules individually. | ||
|
||
```python | ||
# use the retriever
retriever = pack.retriever | ||
nodes = retriever.retrieve("query_str") | ||
|
||
# use the query engine | ||
query_engine = pack.query_engine | ||
response = query_engine.query("query_str") | ||
``` |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,177 @@ | ||
"""Multidoc Autoretriever.""" | ||
|
||
from llama_index import VectorStoreIndex | ||
from llama_index.llms import OpenAI | ||
from typing import List, Dict, Any, Optional, cast | ||
from llama_index.llama_pack.base import BaseLlamaPack | ||
from llama_index.schema import Document, BaseNode | ||
from llama_index.vector_stores import WeaviateVectorStore | ||
from llama_index.vector_stores.types import VectorStoreInfo | ||
from llama_index.storage import StorageContext | ||
from llama_index import VectorStoreIndex | ||
from llama_index.retrievers import VectorIndexAutoRetriever, RecursiveRetriever | ||
from llama_index.vector_stores.types import ( | ||
MetadataFilter, | ||
MetadataFilters, | ||
FilterOperator, | ||
) | ||
|
||
from llama_index.retrievers import BaseRetriever | ||
from llama_index.indices.query.schema import QueryBundle | ||
from llama_index.schema import IndexNode, NodeWithScore | ||
from llama_index.query_engine import RetrieverQueryEngine | ||
|
||
|
||
class IndexAutoRetriever(BaseRetriever):
    """Index auto-retriever.

    Simple wrapper around VectorIndexAutoRetriever that converts the
    retrieved text nodes into ``IndexNode`` objects, so the result can be
    used as the top level of a ``RecursiveRetriever``.
    """

    def __init__(self, retriever: VectorIndexAutoRetriever) -> None:
        """Init params.

        Args:
            retriever (VectorIndexAutoRetriever): The auto-retriever over
                the metadata index whose results will be converted.
        """
        self.retriever = retriever
        # Initialize the BaseRetriever machinery (e.g. callback manager);
        # the original omitted this super() call.
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve and convert text nodes to index nodes.

        Each retrieved node must carry an ``index_id`` metadata key, which
        becomes the target id of the resulting ``IndexNode`` (raises
        ``KeyError`` otherwise).
        """
        retrieved_nodes = self.retriever.retrieve(query_bundle)
        new_retrieved_nodes: List[NodeWithScore] = []
        for retrieved_node in retrieved_nodes:
            index_id = retrieved_node.metadata["index_id"]
            # Re-wrap the text node as an IndexNode pointing at the
            # per-document retriever registered under this index_id.
            index_node = IndexNode.from_text_node(
                retrieved_node.node, index_id=index_id
            )
            new_retrieved_nodes.append(
                NodeWithScore(node=index_node, score=retrieved_node.score)
            )
        return new_retrieved_nodes
|
||
|
||
class MultiDocAutoRetrieverPack(BaseLlamaPack):
    """Multi-doc auto-retriever pack.

    Uses Weaviate as the underlying storage: one collection holds concise
    per-document metadata nodes (queried with an auto-retriever), and a
    second holds the chunked source documents (queried per-document through
    a ``RecursiveRetriever``).

    Args:
        weaviate_client (Any): An already-authenticated ``weaviate.Client``.
        doc_metadata_index_name (str): Weaviate index name for the
            per-document metadata nodes.
        doc_chunks_index_name (str): Weaviate index name for the chunked
            source documents.
        metadata_nodes (List[BaseNode]): One metadata node per document;
            each must carry an ``index_id`` metadata key.
        docs (List[Document]): The source documents; each must carry an
            ``index_id`` metadata key matching its metadata node.
        doc_metadata_schema (VectorStoreInfo): Schema describing the
            metadata fields, used by the auto-retriever to infer filters.
        auto_retriever_kwargs (Optional[Dict[str, Any]]): Extra kwargs
            forwarded to ``VectorIndexAutoRetriever``.
        verbose (bool): Whether to print setup progress and retrieval traces.
    """

    def __init__(
        self,
        weaviate_client: Any,
        doc_metadata_index_name: str,
        doc_chunks_index_name: str,
        metadata_nodes: List[BaseNode],
        docs: List[Document],
        doc_metadata_schema: VectorStoreInfo,
        auto_retriever_kwargs: Optional[Dict[str, Any]] = None,
        verbose: bool = False,
    ) -> None:
        """Init params."""
        import weaviate

        # Validate: metadata nodes and docs must be paired one-to-one.
        if len(docs) != len(metadata_nodes):
            raise ValueError(
                "The number of metadata nodes must match the number of documents."
            )

        # Authentication is the caller's responsibility; we only narrow the
        # type for static checking.
        client = cast(weaviate.Client, weaviate_client)

        # Index the metadata nodes into their own Weaviate collection.
        # Since the metadata nodes are concise summaries, we can feed them
        # directly into VectorStoreIndex as nodes.
        metadata_store = WeaviateVectorStore(
            weaviate_client=client, index_name=doc_metadata_index_name
        )
        metadata_sc = StorageContext.from_defaults(vector_store=metadata_store)
        index = VectorStoreIndex(metadata_nodes, storage_context=metadata_sc)
        if verbose:
            print("Indexed metadata nodes.")

        # Index the source documents into a separate Weaviate collection;
        # per-document retrievers over this index are defined below.
        chunks_store = WeaviateVectorStore(
            weaviate_client=client, index_name=doc_chunks_index_name
        )
        chunks_sc = StorageContext.from_defaults(vector_store=chunks_store)
        doc_index = VectorStoreIndex.from_documents(docs, storage_context=chunks_sc)
        if verbose:
            print("Indexed source document nodes.")

        # Auto-retriever over the metadata index: infers vector-store
        # filters/queries from the user query using the provided schema.
        auto_retriever = VectorIndexAutoRetriever(
            index,
            vector_store_info=doc_metadata_schema,
            **(auto_retriever_kwargs or {}),
        )
        self.index_auto_retriever = IndexAutoRetriever(retriever=auto_retriever)
        if verbose:
            print("Setup autoretriever over metadata.")

        # One retriever per document, filtered to that document's chunks
        # via its index_id metadata value.
        self.retriever_dict = {}
        for doc in docs:
            index_id = doc.metadata["index_id"]
            filters = MetadataFilters(
                filters=[
                    MetadataFilter(
                        key="index_id", operator=FilterOperator.EQ, value=index_id
                    ),
                ]
            )
            retriever = doc_index.as_retriever(filters=filters)

            self.retriever_dict[index_id] = retriever

        if verbose:
            print("Setup per-document retriever.")

        # Recursive retriever: metadata auto-retrieval routes into the
        # matching per-document retrievers.
        # Fix: honor the caller's `verbose` flag (was hardcoded to True).
        self.recursive_retriever = RecursiveRetriever(
            "vector",
            retriever_dict={"vector": self.index_auto_retriever, **self.retriever_dict},
            verbose=verbose,
        )
        if verbose:
            print("Setup recursive retriever.")

        # Plug the recursive retriever into a query engine.
        llm = OpenAI(model="gpt-3.5-turbo")
        self.query_engine = RetrieverQueryEngine.from_args(
            self.recursive_retriever, llm=llm
        )

    def get_modules(self) -> Dict[str, Any]:
        """
        Returns a dictionary containing the internals of the LlamaPack.

        Returns:
            Dict[str, Any]: A dictionary containing the internals of the
            LlamaPack.
        """
        return {
            "index_auto_retriever": self.index_auto_retriever,
            "retriever_dict": self.retriever_dict,
            "recursive_retriever": self.recursive_retriever,
            "query_engine": self.query_engine,
        }

    def run(self, *args: Any, **kwargs: Any) -> Any:
        """
        Runs queries against the index; light wrapper around
        ``query_engine.query()``.

        Returns:
            Any: A response from the query engine.
        """
        return self.query_engine.query(*args, **kwargs)
Oops, something went wrong.