Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
add multi doc autoretrieval pack (#803)
Browse files Browse the repository at this point in the history
* cr

* cr

* cr

* cr
  • Loading branch information
jerryjliu authored Dec 22, 2023
1 parent b2133b9 commit e7d77b4
Show file tree
Hide file tree
Showing 9 changed files with 672 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,7 @@
"metadata": {},
"outputs": [],
"source": [
"cogniswitch_tool_kwargs = {\"cs_token\":cs_token,\n",
" \"apiKey\":oauth_token\n",
" }"
"cogniswitch_tool_kwargs = {\"cs_token\": cs_token, \"apiKey\": oauth_token}"
]
},
{
Expand Down Expand Up @@ -128,7 +126,9 @@
}
],
"source": [
"response = cogniswitch_agent_pack.run(\"Upload the URL- https://cogniswitch.ai/developer\")\n",
"response = cogniswitch_agent_pack.run(\n",
" \"Upload the URL- https://cogniswitch.ai/developer\"\n",
")\n",
"print(response)"
]
},
Expand All @@ -155,7 +155,9 @@
}
],
"source": [
"response = cogniswitch_agent_pack.run(\"tell me the status of the document https://cogniswitch.ai/developer\")\n",
"response = cogniswitch_agent_pack.run(\n",
" \"tell me the status of the document https://cogniswitch.ai/developer\"\n",
")\n",
"print(response)"
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "llama-index-4a-wkI5X-py3.11",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -322,9 +322,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
11 changes: 11 additions & 0 deletions llama_hub/llama_packs/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -200,5 +200,16 @@
"symbolic",
"embedding"
]
},
"MultiDocAutoRetrieverPack": {
"id": "llama_packs/multidoc_autoretrieval",
"author": "jerryjliu",
"keywords": [
"autoretrieval",
"multi",
"multidoc",
"document",
"retrieval"
]
}
}
94 changes: 94 additions & 0 deletions llama_hub/llama_packs/multidoc_autoretrieval/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Multi-Document AutoRetrieval (with Weaviate) Pack

This LlamaPack implements structured hierarchical retrieval over multiple documents, using multiple Weaviate collections.

## CLI Usage

You can download llamapacks directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:

```bash
llamaindex-cli download-llamapack MultiDocAutoRetrieverPack --download-dir ./multidoc_autoretrieval_pack
```

You can then inspect the files at `./multidoc_autoretrieval_pack` and use them as a template for your own project!

## Code Usage

You can download the pack to the `./multidoc_autoretrieval_pack` directory:

```python
from llama_index.llama_pack import download_llama_pack

# download and install dependencies
MultiDocAutoRetrieverPack = download_llama_pack(
"MultiDocAutoRetrieverPack", "./multidoc_autoretrieval_pack"
)
```

From here, you can use the pack. To initialize it, you need to define a few arguments, see below.

Then, you can set up the pack like so:

```python
# setup pack arguments
from llama_index.vector_stores.types import MetadataInfo, VectorStoreInfo

import weaviate

# cloud
auth_config = weaviate.AuthApiKey(api_key="<api_key>")
client = weaviate.Client(
"https://<cluster>.weaviate.network",
auth_client_secret=auth_config,
)

vector_store_info = VectorStoreInfo(
content_info="Github Issues",
metadata_info=[
MetadataInfo(
name="state",
description="Whether the issue is `open` or `closed`",
type="string",
),
...
]
)

# metadata_nodes is set of nodes with metadata representing each document
# docs is the source docs
# metadata_nodes and docs must be the same length
metadata_nodes = [TextNode(..., metadata={...}), ...]
docs = [Document(...), ...]

pack = MultiDocAutoRetrieverPack(
client,
"<metadata_index_name>",
"<doc_chunks_index_name>",
metadata_nodes,
docs,
vector_store_info,
auto_retriever_kwargs={
# any kwargs for the auto-retriever
...
}
)

```

The `run()` function is a light wrapper around `query_engine.query()`.

```python
response = pack.run("Tell me about a music celebrity.")
```

You can also use modules individually.

```python
# use the retriever
retriever = pack.retriever
nodes = retriever.retrieve("query_str")

# use the query engine
query_engine = pack.query_engine
response = query_engine.query("query_str")
```
Empty file.
177 changes: 177 additions & 0 deletions llama_hub/llama_packs/multidoc_autoretrieval/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
"""Multidoc Autoretriever."""

from llama_index import VectorStoreIndex
from llama_index.llms import OpenAI
from typing import List, Dict, Any, Optional, cast
from llama_index.llama_pack.base import BaseLlamaPack
from llama_index.schema import Document, BaseNode
from llama_index.vector_stores import WeaviateVectorStore
from llama_index.vector_stores.types import VectorStoreInfo
from llama_index.storage import StorageContext
from llama_index import VectorStoreIndex
from llama_index.retrievers import VectorIndexAutoRetriever, RecursiveRetriever
from llama_index.vector_stores.types import (
MetadataFilter,
MetadataFilters,
FilterOperator,
)

from llama_index.retrievers import BaseRetriever
from llama_index.indices.query.schema import QueryBundle
from llama_index.schema import IndexNode, NodeWithScore
from llama_index.query_engine import RetrieverQueryEngine


class IndexAutoRetriever(BaseRetriever):
    """Index auto-retriever.

    Simple wrapper around VectorIndexAutoRetriever that converts the
    retrieved text nodes into index nodes, so a RecursiveRetriever can
    route each result into its per-document retriever.
    """

    def __init__(self, retriever: VectorIndexAutoRetriever):
        """Init params.

        Args:
            retriever (VectorIndexAutoRetriever): auto-retriever whose
                results are converted from text nodes to index nodes.
        """
        # BaseRetriever.__init__ initializes shared state (e.g. the
        # callback manager); skipping it breaks `.retrieve()` on
        # llama-index versions that depend on it.
        super().__init__()
        self.retriever = retriever

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Convert retrieved text nodes to index nodes.

        Each retrieved node is expected to carry an "index_id" metadata
        entry identifying the per-document retriever it maps to.

        Raises:
            KeyError: if a retrieved node lacks "index_id" metadata.
        """
        retrieved_nodes = self.retriever.retrieve(query_bundle)
        new_retrieved_nodes = []
        for retrieved_node in retrieved_nodes:
            index_id = retrieved_node.metadata["index_id"]
            index_node = IndexNode.from_text_node(
                retrieved_node.node, index_id=index_id
            )
            new_retrieved_nodes.append(
                NodeWithScore(node=index_node, score=retrieved_node.score)
            )
        return new_retrieved_nodes


class MultiDocAutoRetrieverPack(BaseLlamaPack):
    """Multi-doc auto-retriever pack.

    Uses Weaviate as the underlying storage: one collection stores concise
    per-document metadata nodes (auto-retrieved over), a second stores the
    chunked source documents (retrieved recursively per document).

    Args:
        weaviate_client (Any): an initialized ``weaviate.Client``.
        doc_metadata_index_name (str): Weaviate index name for the
            per-document metadata nodes.
        doc_chunks_index_name (str): Weaviate index name for the chunked
            source documents.
        metadata_nodes (List[BaseNode]): one metadata node per document;
            each must carry an ``index_id`` metadata entry matching its doc.
        docs (List[Document]): the source documents; each must carry an
            ``index_id`` metadata entry. Must be the same length as
            ``metadata_nodes``.
        doc_metadata_schema (VectorStoreInfo): schema describing the
            metadata fields, used by the auto-retriever to infer filters.
        auto_retriever_kwargs (Optional[Dict[str, Any]]): extra kwargs
            forwarded to ``VectorIndexAutoRetriever``.
        verbose (bool): whether to print progress during setup and
            retrieval.

    Raises:
        ValueError: if ``docs`` and ``metadata_nodes`` differ in length.
    """

    def __init__(
        self,
        weaviate_client: Any,
        doc_metadata_index_name: str,
        doc_chunks_index_name: str,
        metadata_nodes: List[BaseNode],
        docs: List[Document],
        doc_metadata_schema: VectorStoreInfo,
        auto_retriever_kwargs: Optional[Dict[str, Any]] = None,
        verbose: bool = False,
    ) -> None:
        """Init params."""
        import weaviate

        # do some validation
        if len(docs) != len(metadata_nodes):
            raise ValueError(
                "The number of metadata nodes must match the number of documents."
            )

        client = cast(weaviate.Client, weaviate_client)

        # initialize two vector store classes corresponding to the two index names
        metadata_store = WeaviateVectorStore(
            weaviate_client=client, index_name=doc_metadata_index_name
        )
        metadata_sc = StorageContext.from_defaults(vector_store=metadata_store)
        # metadata nodes are concise summaries, so they can be fed directly
        # as nodes into VectorStoreIndex (no chunking needed)
        index = VectorStoreIndex(metadata_nodes, storage_context=metadata_sc)
        if verbose:
            print("Indexed metadata nodes.")

        # construct a separate Weaviate index with the original docs; each
        # doc is chunked by the index's default node parser
        chunks_store = WeaviateVectorStore(
            weaviate_client=client, index_name=doc_chunks_index_name
        )
        chunks_sc = StorageContext.from_defaults(vector_store=chunks_store)
        doc_index = VectorStoreIndex.from_documents(docs, storage_context=chunks_sc)
        if verbose:
            print("Indexed source document nodes.")

        # setup auto retriever over the metadata collection
        auto_retriever = VectorIndexAutoRetriever(
            index,
            vector_store_info=doc_metadata_schema,
            **(auto_retriever_kwargs or {}),
        )
        self.index_auto_retriever = IndexAutoRetriever(retriever=auto_retriever)
        if verbose:
            print("Setup autoretriever over metadata.")

        # define a per-document retriever, each filtered to one doc's chunks
        self.retriever_dict = {}
        for doc in docs:
            index_id = doc.metadata["index_id"]
            # filter for the specific doc id
            filters = MetadataFilters(
                filters=[
                    MetadataFilter(
                        key="index_id", operator=FilterOperator.EQ, value=index_id
                    ),
                ]
            )
            retriever = doc_index.as_retriever(filters=filters)

            self.retriever_dict[index_id] = retriever

        if verbose:
            print("Setup per-document retriever.")

        # setup recursive retriever: the "vector" root auto-retrieves over
        # metadata, then recurses into the matching per-document retriever.
        # Honor the pack-level `verbose` flag (was previously hard-coded True).
        self.recursive_retriever = RecursiveRetriever(
            "vector",
            retriever_dict={"vector": self.index_auto_retriever, **self.retriever_dict},
            verbose=verbose,
        )
        if verbose:
            print("Setup recursive retriever.")

        # plug into query engine
        llm = OpenAI(model="gpt-3.5-turbo")
        self.query_engine = RetrieverQueryEngine.from_args(
            self.recursive_retriever, llm=llm
        )

    def get_modules(self) -> Dict[str, Any]:
        """
        Returns a dictionary containing the internals of the LlamaPack.

        Returns:
            Dict[str, Any]: A dictionary containing the internals of the
            LlamaPack.
        """
        return {
            "index_auto_retriever": self.index_auto_retriever,
            "retriever_dict": self.retriever_dict,
            "recursive_retriever": self.recursive_retriever,
            "query_engine": self.query_engine,
        }

    def run(self, *args: Any, **kwargs: Any) -> Any:
        """
        Runs queries against the index.

        Returns:
            Any: A response from the query engine.
        """
        return self.query_engine.query(*args, **kwargs)
Loading

0 comments on commit e7d77b4

Please sign in to comment.