Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
Initial commit of the Mintter Publications Reader
Browse files Browse the repository at this point in the history
  • Loading branch information
elsatch committed Feb 5, 2024
1 parent 20c2f59 commit 382b597
Show file tree
Hide file tree
Showing 6 changed files with 154 additions and 0 deletions.
8 changes: 8 additions & 0 deletions llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,14 @@
"spreadsheet"
]
},
"MintterPublicationsReader": {
"id": "mintter",
"author": "elsatch",
"keywords": [
"hypertext",
"hypermedia"
]
},
"PagedCSVReader": {
"id": "file/paged_csv",
"author": "thejessezhang",
Expand Down
26 changes: 26 additions & 0 deletions llama_hub/mintter/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Mintter Publications Reader

This reader loads documents from the Mintter Hypermedia App. The user must have a local installation of the Mintter App before running this loader. They then need to specify whether it connects via `author_publications` or `group_publications`. If `group_publications` is selected, the user must also specify a `group_id` to load the corresponding Document objects.

Note that this Loader extracts the contents using the Hypermedia structures to extract additional metadata and the hierarchical structure of the documents (blocks, etc). This information is extracted from the Mintter local daemon using gRPC calls.

The Hypermedia structured data might complicate its use in RAG applications, because a single topic can span several blocks. Additional care must be taken to retrieve the correct context for RAG.

## Usage

Here's an example usage of the MintterPublicationsReader.

```python
from llama_index import download_loader
import os

MintterPublicationsReader = download_loader('MintterPublicationsReader')

access_method = "group_publications"
group_id = "<group_id>"

loader = MintterPublicationsReader(access_method=access_method, group_id="<group_id>")
documents = loader.load_data()
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
6 changes: 6 additions & 0 deletions llama_hub/mintter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Init file."""
from llama_hub.mintter.base import (
MintterPublicationsReader,
)

__all__ = ["MintterPublicationsReader"]
95 changes: 95 additions & 0 deletions llama_hub/mintter/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""Mintter reader class.
Pass in the access_method (either author_publications or group_publications). For group_publications, pass in the group_id as well.
This will import the publications into a List of Documents,
with each Document containing text from under a Mintter block.
"""
from groups.v1alpha import groups_pb2
from groups.v1alpha import groups_pb2_grpc
from documents.v1alpha import documents_pb2
from documents.v1alpha import documents_pb2_grpc

from google.protobuf.json_format import MessageToDict

import grpc

from typing import Any, List, TYPE_CHECKING

if TYPE_CHECKING:
from langchain.docstore.document import Document as LCDocument

from llama_index.readers.base import BaseReader
from llama_index.readers.json import JSONReader
from llama_index.readers.schema.base import Document


class MintterPublicationsReader(BaseReader):
    """Utilities for loading data from a Mintter daemon.

    Talks to the local Mintter daemon over gRPC (localhost:55002) and
    converts the publications it returns into llama-index ``Document``
    objects.

    Args:
        access_method (str): group_publications | author_publications.
        group_id (str, optional): The group id to load publications from.
            Required when ``access_method`` is ``"group_publications"``.
    """

    def __init__(self, access_method: str, group_id: str = None):
        """Init params."""
        self.access_method = access_method
        # group_id is only meaningful for group_publications; default to
        # None so author_publications callers need not pass it.
        self.group_id = group_id

    def list_group_content(self, group_id: str):
        """Return the raw ListContent gRPC response for *group_id*.

        Opens a short-lived insecure channel to the local daemon; the
        ``with`` block guarantees the channel is closed even on error.
        """
        with grpc.insecure_channel('localhost:55002') as channel:
            stub = groups_pb2_grpc.GroupsStub(channel)
            request = groups_pb2.ListContentRequest(id=group_id)
            response = stub.ListContent(request)
            return response

    def generate_publications_info(self, group_publication_list: dict) -> list:
        """Split each ``title -> "docid?v=version"`` entry into a dict.

        Args:
            group_publication_list (dict): ``MessageToDict`` form of a
                ListContent response; its ``"content"`` key maps titles
                to document ids.

        Returns:
            list: dicts with ``title``, ``document_id`` and ``version``
            (``version`` is None when the id carries no ``?v=`` suffix).
        """
        document_details = []
        for title, document_id in group_publication_list["content"].items():
            cid_list = document_id.split("?v=")
            document_details.append({
                "title": title,  # Store the title/key for reference
                "document_id": cid_list[0],  # The document ID is always present
                "version": cid_list[1] if len(cid_list) > 1 else None,  # The version is optional
            })
        return document_details

    def extract_document_content(self, document_id: str, version, local_only: bool = False):
        """Fetch one publication from the local daemon via GetPublication.

        Args:
            document_id (str): The bare document CID (no ``?v=`` suffix).
            version (str | None): Specific version to fetch, or None for
                the latest (the request simply omits the field).
            local_only (bool): Restrict the lookup to local data only.

        Returns:
            The raw GetPublication gRPC response message.
        """
        # LightClient syntax reference:
        # res = self._publications.GetPublication(documents_pb2.GetPublicationRequest(
        #     document_id=eid.split("?v=")[0], version=eid.split("?v=")[1], local_only=local_only))
        with grpc.insecure_channel('localhost:55002') as channel:
            stub = documents_pb2_grpc.PublicationsStub(channel)
            if version is None:
                request = documents_pb2.GetPublicationRequest(
                    document_id=document_id, local_only=local_only
                )
            else:
                request = documents_pb2.GetPublicationRequest(
                    document_id=document_id, version=version, local_only=local_only
                )
            response = stub.GetPublication(request)
            return response

    def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]:
        """Load all publications of ``self.group_id`` as Documents.

        Returns:
            List[Document]: one or more Documents per publication.
        """
        docs: List[Document] = []

        group_publications = self.list_group_content(self.group_id)
        group_publications_list = MessageToDict(group_publications)
        # Fixed: original called self.generate_documents_info, which does
        # not exist (the method is named generate_publications_info).
        group_publications_details = self.generate_publications_info(
            group_publications_list
        )

        for doc in group_publications_details:
            # Fixed: original passed `self` explicitly to a bound method,
            # which raised TypeError (too many positional arguments).
            extracted_document = self.extract_document_content(
                doc["document_id"], doc["version"]
            )
            # NOTE(review): JSONReader.load_data is handed a gRPC response
            # message here, but llama-hub's JSONReader expects a file path
            # — verify this call against the installed JSONReader API.
            content = JSONReader().load_data(extracted_document)
            docs.extend(content)
        return docs

    def load_langchain_documents(self, **load_kwargs: Any) -> List["LCDocument"]:
        """Load data in LangChain document format."""
        docs = self.load_data(**load_kwargs)
        return [d.to_langchain_format() for d in docs]
4 changes: 4 additions & 0 deletions llama_hub/mintter/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# requirements taken from lightclient
grpcio==1.51.1
grpcio-tools==1.51.1
protobuf==4.21.12
15 changes: 15 additions & 0 deletions llama_hub/mintter/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Smoke test for the MintterPublicationsReader loader:
# download the reader class, pull every publication of a Mintter group
# from the local daemon, index them and issue one sample query.

from llama_index import VectorStoreIndex, download_loader

# Resolve the reader class through llama-hub's loader registry.
reader_cls = download_loader('MintterPublicationsReader')

# Load the publications of the target group from the local daemon.
reader = reader_cls(
    access_method='group_publications',
    group_id='hm://g/4FRae3AD1WpmfroSRMFGh',
)
publications = reader.load_data()

# Build an in-memory vector index over the documents and run a query.
index = VectorStoreIndex.from_documents(publications)
index.query('¿Cuál es el principal factor para conseguir un buen rendimiento de un modelo LLM?')

0 comments on commit 382b597

Please sign in to comment.