From 382b5979b7098bc33d6a8c1ec38a4b04ce3841f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9sar=20Garc=C3=ADa?= Date: Tue, 6 Feb 2024 00:40:02 +0100 Subject: [PATCH] Initial commit of the Mintter Publications Reader --- llama_hub/library.json | 8 +++ llama_hub/mintter/README.md | 26 ++++++++ llama_hub/mintter/__init__.py | 6 ++ llama_hub/mintter/base.py | 95 ++++++++++++++++++++++++++++++ llama_hub/mintter/requirements.txt | 4 ++ llama_hub/mintter/test.py | 15 +++++ 6 files changed, 154 insertions(+) create mode 100644 llama_hub/mintter/README.md create mode 100644 llama_hub/mintter/__init__.py create mode 100644 llama_hub/mintter/base.py create mode 100644 llama_hub/mintter/requirements.txt create mode 100644 llama_hub/mintter/test.py diff --git a/llama_hub/library.json b/llama_hub/library.json index 8d2e42243f..f0e971e368 100644 --- a/llama_hub/library.json +++ b/llama_hub/library.json @@ -166,6 +166,14 @@ "spreadsheet" ] }, + "MintterPublicationsReader": { + "id": "mintter", + "author": "elsatch", + "keywords": [ + "hypertext", + "hypermedia" + ] + }, "PagedCSVReader": { "id": "file/paged_csv", "author": "thejessezhang", diff --git a/llama_hub/mintter/README.md b/llama_hub/mintter/README.md new file mode 100644 index 0000000000..30b061540a --- /dev/null +++ b/llama_hub/mintter/README.md @@ -0,0 +1,26 @@ +# Mintter Publications Reader + +This reader loads documents from Mintter Hypermedia App. The user must have a local installation of Mintter App before running this loader. Then they need to specify if it's connecting to `author_publications` or `group_publications`. In case `group_publications` is selected, the user must specify `group_id` to load in the corresponding Document objects. + +Note that this Loader extracts the contents using the Hypermedia structures to extract additional metadata and the hierarchical structure of the documents (blocks, etc). This information is extracted from the Mintter local daemon using gRPC calls. 
+ +The Hypermedia structured data might compromise its use for RAG applications, given a single topic might span over several blocks. Additional care must be taken to retrieve the correct context for RAG. + +## Usage + +Here's an example usage of the MintterPublicationsReader. + +```python +from llama_index import download_loader +import os + +MintterPublicationsReader = download_loader('MintterPublicationsReader') + +access_method = "group_publications" +group_id = "" + +loader = MintterPublicationsReader(access_method=access_method, group_id="") +documents = loader.load_data() +``` + +This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. diff --git a/llama_hub/mintter/__init__.py b/llama_hub/mintter/__init__.py new file mode 100644 index 0000000000..8274aa9f04 --- /dev/null +++ b/llama_hub/mintter/__init__.py @@ -0,0 +1,6 @@ +"""Init file.""" +from llama_hub.mintter.base import ( + MintterPublicationsReader, +) + +__all__ = ["MintterPublicationsReader"] \ No newline at end of file diff --git a/llama_hub/mintter/base.py b/llama_hub/mintter/base.py new file mode 100644 index 0000000000..990531b384 --- /dev/null +++ b/llama_hub/mintter/base.py @@ -0,0 +1,95 @@ +"""Mintter reader class. + +Pass in the access_method (either author_publications or group_publications). For group_publications, pass in the group_id as well. +This will import the publications into a List of Documents, +with each Document containing text from under a Mintter block. 
+
+"""
+from groups.v1alpha import groups_pb2
+from groups.v1alpha import groups_pb2_grpc
+from documents.v1alpha import documents_pb2
+from documents.v1alpha import documents_pb2_grpc
+
+from google.protobuf.json_format import MessageToDict
+import grpc
+
+import json
+from typing import Any, List, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from langchain.docstore.document import Document as LCDocument
+
+from llama_index.readers.base import BaseReader
+from llama_index.readers.json import JSONReader
+from llama_index.readers.schema.base import Document
+
+
+class MintterPublicationsReader(BaseReader):
+    """Load publications from a local Mintter daemon over gRPC.
+
+    Only group publications are fetched: access_method is stored on the
+    instance but load_data always lists the content of group_id.
+
+    Args:
+        access_method (str): group_publications | author_publications.
+        (opt) group_id (str): The group id to load publications from.
+
+    """
+
+    def __init__(self, access_method: str, group_id: str):
+        """Init params."""
+        self.access_method = access_method
+        self.group_id = group_id
+
+    def list_group_content(self, group_id):
+        """Return the daemon's ListContent response for group_id."""
+        with grpc.insecure_channel('localhost:55002') as channel:
+            stub = groups_pb2_grpc.GroupsStub(channel)
+            request = groups_pb2.ListContentRequest(id=group_id)
+            return stub.ListContent(request)
+
+    def generate_publications_info(self, group_publication_list):
+        """Split each "<document_id>?v=<version>" content entry into parts."""
+        document_details = []
+        # MessageToDict omits empty fields, so an empty group has no "content".
+        for key, value in group_publication_list.get("content", {}).items():
+            document_id, separator, version = value.partition("?v=")
+            document_details.append({
+                "title": key,  # Store the title/key for reference
+                "document_id": document_id,  # The document ID is always present
+                "version": version if separator else None,  # The version is optional
+            })
+        return document_details
+
+    def extract_document_content(self, document_id, version, local_only=False):
+        """Fetch one publication; a None version is left out of the request."""
+        request_args = {"document_id": document_id, "local_only": local_only}
+        if version is not None:
+            request_args["version"] = version
+        with grpc.insecure_channel('localhost:55002') as channel:
+            stub = documents_pb2_grpc.PublicationsStub(channel)
+            request = documents_pb2.GetPublicationRequest(**request_args)
+            return stub.GetPublication(request)
+
+    def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
+        """Load every publication of the configured group as a Document.
+
+        Returns:
+            List[Document]: One Document per publication, holding the
+                publication as JSON text plus title/id/version metadata.
+        """
+        docs: List[Document] = []
+        listing = MessageToDict(self.list_group_content(self.group_id))
+        for doc in self.generate_publications_info(listing):
+            publication = self.extract_document_content(
+                doc['document_id'], doc['version']
+            )
+            # JSONReader.load_data expects a file path, not a gRPC message.
+            text = json.dumps(MessageToDict(publication), ensure_ascii=False)
+            docs.append(Document(text=text, metadata=doc))
+        return docs
+
+    def load_langchain_documents(self, **load_kwargs: Any) -> List["LCDocument"]:
+        """Load data in LangChain document format."""
+        docs = self.load_data(**load_kwargs)
+        return [d.to_langchain_format() for d in docs]
\ No newline at end of file
diff --git a/llama_hub/mintter/requirements.txt b/llama_hub/mintter/requirements.txt
new file mode 100644
index 0000000000..2e96febecf
--- /dev/null
+++ b/llama_hub/mintter/requirements.txt
@@ -0,0 +1,4 @@
+# requirements taken from lightclient
+grpcio==1.51.1
+grpcio-tools==1.51.1
+protobuf==4.21.12
\ No newline at end of file
diff --git a/llama_hub/mintter/test.py b/llama_hub/mintter/test.py
new file mode 100644
index 0000000000..fb789d3498
--- /dev/null
+++ b/llama_hub/mintter/test.py
@@ -0,0 +1,15 @@
+# We will call the loader using the following functions
+# loader = MintterPublicationsReader(access_method=access_method, group_id="")
+# documents = loader.load_data()
+
+from llama_index import VectorStoreIndex, download_loader
+
+MintterPublicationsReader = download_loader('MintterPublicationsReader')
+
+access_method = 'group_publications'
+group_id = 'hm://g/4FRae3AD1WpmfroSRMFGh'
+
+loader = MintterPublicationsReader(access_method=access_method, group_id=group_id)
+documents = loader.load_data()
+index = VectorStoreIndex.from_documents(documents)
+print(index.as_query_engine().query('¿Cuál es el principal factor para conseguir un buen rendimiento de un modelo LLM?'))
\ No newline at end of file