This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Updated Docugami loader (#771)
tjaffri authored Feb 8, 2024
1 parent 5263112 commit be02528
Showing 5 changed files with 474 additions and 1,146 deletions.
4 changes: 2 additions & 2 deletions llama_hub/docugami/README.md
@@ -18,8 +18,8 @@ from llama_index import download_loader

DocugamiReader = download_loader('DocugamiReader')

docset_id="ecxqpipcoe2p"
document_ids=["43rj0ds7s0ur", "bpc1vibyeke2"]
docset_id="tjwrr2ekqkc3"
document_ids=["ui7pkriyckwi", "1be3o7ch10iy"]

loader = DocugamiReader()
documents = loader.load_data(docset_id=docset_id, document_ids=document_ids)
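
For context, the documents returned by this loader feed directly into the usual llama_index flow. A minimal sketch, assuming a llama_index version of this era that exposes VectorStoreIndex at the top level:

from llama_index import VectorStoreIndex

# Build a queryable index over the Docugami chunks (illustrative usage, not part of this commit).
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
print(query_engine.query("What are the key dates in these documents?"))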
259 changes: 136 additions & 123 deletions llama_hub/docugami/base.py
@@ -1,26 +1,30 @@
"""Docugami reader."""

import hashlib
import io
import logging
import os
import re
from typing import Any, Dict, List, Mapping, Optional
from pathlib import Path
from typing import Dict, List, Mapping, Optional, Sequence, Union

import requests
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document

TD_NAME = "{http://www.w3.org/1999/xhtml}td"
TABLE_NAME = "{http://www.w3.org/1999/xhtml}table"

XPATH_KEY = "xpath"
DOCUMENT_ID_KEY = "id"
ID_KEY = "id"
DOCUMENT_SOURCE_KEY = "source"
DOCUMENT_NAME_KEY = "name"
STRUCTURE_KEY = "structure"
TAG_KEY = "tag"
PROJECTS_KEY = "projects"

DEFAULT_API_ENDPOINT = "https://api.docugami.com/v1preview1"

logger = logging.getLogger(__name__)


class DocugamiReader(BaseReader):
"""Docugami reader.
@@ -30,126 +34,122 @@ class DocugamiReader(BaseReader):
"""

api: str = DEFAULT_API_ENDPOINT
"""The Docugami API endpoint to use."""

access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY")
min_chunk_size: int = 32 # appended to next chunk to avoid over-chunking
"""The Docugami API access token to use."""

max_text_length = 4096
"""Max length of chunk text returned."""

min_text_length: int = 32
"""Threshold under which chunks are appended to next to avoid over-chunking."""

max_metadata_length = 512
"""Max length of metadata text returned."""

include_xml_tags: bool = False
"""Set to true for XML tags in chunk output text."""

parent_hierarchy_levels: int = 0
"""Set appropriately to get parent chunks using the chunk hierarchy."""

parent_id_key: str = "doc_id"
"""Metadata key for parent doc ID."""

sub_chunk_tables: bool = False
"""Set to True to return sub-chunks within tables."""

whitespace_normalize_text: bool = True
"""Set to False if you want to full whitespace formatting in the original
XML doc, including indentation."""

docset_id: Optional[str]
"""The Docugami API docset ID to use."""

document_ids: Optional[Sequence[str]]
"""The Docugami API document IDs to use."""

file_paths: Optional[Sequence[Union[Path, str]]]
"""The local file paths to use."""

include_project_metadata_in_doc_metadata: bool = True
"""Set to True if you want to include the project metadata in the doc metadata."""

def _parse_dgml(
self, document: Mapping, content: bytes, doc_metadata: Optional[Mapping] = None
self,
content: bytes,
document_name: Optional[str] = None,
additional_doc_metadata: Optional[Mapping] = None,
) -> List[Document]:
"""Parse a single DGML document into a list of Documents."""
try:
from lxml import etree
except ImportError:
raise ValueError(
raise ImportError(
"Could not import lxml python package. "
"Please install it with `pip install lxml`."
)

# helpers
def _xpath_qname_for_chunk(chunk: Any) -> str:
"""Get the xpath qname for a chunk."""
qname = f"{chunk.prefix}:{chunk.tag.split('}')[-1]}"

parent = chunk.getparent()
if parent is not None:
doppelgangers = [x for x in parent if x.tag == chunk.tag]
if len(doppelgangers) > 1:
idx_of_self = doppelgangers.index(chunk)
qname = f"{qname}[{idx_of_self + 1}]"

return qname

def _xpath_for_chunk(chunk: Any) -> str:
"""Get the xpath for a chunk."""
ancestor_chain = chunk.xpath("ancestor-or-self::*")
return "/" + "/".join(_xpath_qname_for_chunk(x) for x in ancestor_chain)

def _structure_value(node: Any) -> Optional[str]:
"""Get the structure value for a node."""
structure = (
"table"
if node.tag == TABLE_NAME
else node.attrib["structure"]
if "structure" in node.attrib
else None
try:
from dgml_utils.models import Chunk
from dgml_utils.segmentation import get_chunks
except ImportError:
raise ImportError(
"Could not import from dgml-utils python package. "
"Please install it with `pip install dgml-utils`."
)
return structure

def _is_structural(node: Any) -> bool:
"""Check if a node is structural."""
return _structure_value(node) is not None

def _is_heading(node: Any) -> bool:
"""Check if a node is a heading."""
structure = _structure_value(node)
return structure is not None and structure.lower().startswith("h")

def _get_text(node: Any) -> str:
"""Get the text of a node."""
return " ".join(node.itertext()).strip()

def _has_structural_descendant(node: Any) -> bool:
"""Check if a node has a structural descendant."""
for child in node:
if _is_structural(child) or _has_structural_descendant(child):
return True
return False

def _leaf_structural_nodes(node: Any) -> List:
"""Get the leaf structural nodes of a node."""
if _is_structural(node) and not _has_structural_descendant(node):
return [node]
else:
leaf_nodes = []
for child in node:
leaf_nodes.extend(_leaf_structural_nodes(child))
return leaf_nodes

def _create_doc(node: Any, text: str) -> Document:
"""Create a Document from a node and text."""
def _build_framework_chunk(dg_chunk: Chunk) -> Document:
# Stable IDs for chunks with the same text.
_hashed_id = hashlib.md5(dg_chunk.text.encode()).hexdigest()
metadata = {
XPATH_KEY: _xpath_for_chunk(node),
DOCUMENT_ID_KEY: document["id"],
DOCUMENT_NAME_KEY: document["name"],
STRUCTURE_KEY: node.attrib.get("structure", ""),
TAG_KEY: re.sub(r"\{.*\}", "", node.tag),
XPATH_KEY: dg_chunk.xpath,
ID_KEY: _hashed_id,
DOCUMENT_NAME_KEY: document_name,
STRUCTURE_KEY: dg_chunk.structure,
TAG_KEY: dg_chunk.tag,
}

if doc_metadata:
metadata.update(doc_metadata)
text = dg_chunk.text
if additional_doc_metadata:
if self.include_project_metadata_in_doc_metadata:
metadata.update(additional_doc_metadata)

return Document(
text=text,
text=text[: self.max_text_length],
metadata=metadata,
excluded_llm_metadata_keys=[XPATH_KEY, DOCUMENT_ID_KEY, STRUCTURE_KEY],
excluded_llm_metadata_keys=[XPATH_KEY, ID_KEY, STRUCTURE_KEY],
)
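
The MD5-of-text scheme above gives chunks with identical text the same ID across runs, which is what lets repeated loads dedupe cleanly. A quick illustration of that property:

import hashlib

a = hashlib.md5("Section 1. Definitions".encode()).hexdigest()
b = hashlib.md5("Section 1. Definitions".encode()).hexdigest()
assert a == b  # identical text always hashes to the identical chunk ID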

# parse the tree and return chunks
# Parse the tree and return chunks
tree = etree.parse(io.BytesIO(content))
root = tree.getroot()

chunks: List[Document] = []
prev_small_chunk_text = None
for node in _leaf_structural_nodes(root):
text = _get_text(node)
if prev_small_chunk_text:
text = prev_small_chunk_text + " " + text
prev_small_chunk_text = None

if _is_heading(node) or len(text) < self.min_chunk_size:
# Save headings or other small chunks to be appended to the next chunk
prev_small_chunk_text = text
else:
chunks.append(_create_doc(node, text))

if prev_small_chunk_text and len(chunks) > 0:
# small chunk at the end left over, just append to last chunk
if not chunks[-1].text:
chunks[-1].text = prev_small_chunk_text
else:
chunks[-1].text += " " + prev_small_chunk_text
dg_chunks = get_chunks(
root,
min_text_length=self.min_text_length,
max_text_length=self.max_text_length,
whitespace_normalize_text=self.whitespace_normalize_text,
sub_chunk_tables=self.sub_chunk_tables,
include_xml_tags=self.include_xml_tags,
parent_hierarchy_levels=self.parent_hierarchy_levels,
)

return chunks
framework_chunks: Dict[str, Document] = {}
for dg_chunk in dg_chunks:
framework_chunk = _build_framework_chunk(dg_chunk)
chunk_id = framework_chunk.metadata.get(ID_KEY)
if chunk_id:
framework_chunks[chunk_id] = framework_chunk
if dg_chunk.parent:
framework_parent_chunk = _build_framework_chunk(dg_chunk.parent)
parent_id = framework_parent_chunk.metadata.get(ID_KEY)
                if parent_id and framework_parent_chunk.text:
framework_chunk.metadata[self.parent_id_key] = parent_id
framework_chunks[parent_id] = framework_parent_chunk

return list(framework_chunks.values())

def _document_details_for_docset_id(self, docset_id: str) -> List[Dict]:
"""Gets all document details for the given docset ID"""
@@ -197,11 +197,12 @@ def _project_details_for_docset_id(self, docset_id: str) -> List[Dict]:

def _metadata_for_project(self, project: Dict) -> Dict:
"""Gets project metadata for all files"""
project_id = project.get("id")
project_id = project.get(ID_KEY)

url = f"{self.api}/projects/{project_id}/artifacts/latest"
all_artifacts = []

per_file_metadata: Dict = {}
while url:
response = requests.request(
"GET",
@@ -213,22 +214,24 @@ def _metadata_for_project(self, project: Dict) -> Dict:
data = response.json()
all_artifacts.extend(data["artifacts"])
url = data.get("next", None)
elif response.status_code == 404:
# Not found is ok, just means no published projects
return per_file_metadata
else:
raise Exception(
f"Failed to download {url} (status: {response.status_code})"
)
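
The loop above follows Docugami's cursor-style pagination: keep requesting as long as the response carries a "next" URL. The same pattern in isolation (a sketch; the bearer-token header is an assumption, since the actual request headers are elided from this hunk):

import requests

def fetch_all_artifacts(url: str, token: str) -> list:
    items = []
    while url:  # follow "next" links until the API stops returning one
        resp = requests.get(url, headers={"Authorization": f"Bearer {token}"})
        resp.raise_for_status()
        data = resp.json()
        items.extend(data.get("artifacts", []))
        url = data.get("next")
    return items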

per_file_metadata = {}
for artifact in all_artifacts:
artifact_name = artifact.get("name")
artifact_url = artifact.get("url")
artifact_doc = artifact.get("document")

if artifact_name == "report-values.xml" and artifact_url and artifact_doc:
doc_id = artifact_doc["id"]
doc_id = artifact_doc[ID_KEY]
metadata: Dict = {}

# the evaluated XML for each document is named after the project
# The evaluated XML for each document is named after the project
response = requests.request(
"GET",
f"{artifact_url}/content",
@@ -240,7 +243,7 @@ def _metadata_for_project(self, project: Dict) -> Dict:
try:
from lxml import etree
except ImportError:
raise ValueError(
raise ImportError(
"Could not import lxml python package. "
"Please install it with `pip install lxml`."
)
@@ -253,7 +256,7 @@ def _metadata_for_project(self, project: Dict) -> Dict:
value = " ".join(
entry.xpath("./pr:Value", namespaces=ns)[0].itertext()
).strip()
metadata[heading] = value
metadata[heading] = value[: self.max_metadata_length]
per_file_metadata[doc_id] = metadata
else:
raise Exception(
@@ -264,10 +267,13 @@ def _metadata_for_project(self, project: Dict) -> Dict:
return per_file_metadata

def _load_chunks_for_document(
self, docset_id: str, document: Dict, doc_metadata: Optional[Dict] = None
self,
document_id: str,
docset_id: str,
document_name: Optional[str] = None,
additional_metadata: Optional[Mapping] = None,
) -> List[Document]:
"""Load chunks for a document."""
document_id = document["id"]
url = f"{self.api}/docsets/{docset_id}/documents/{document_id}/dgml"

response = requests.request(
@@ -278,7 +284,11 @@ def _load_chunks_for_document(
)

if response.ok:
return self._parse_dgml(document, response.content, doc_metadata)
return self._parse_dgml(
content=response.content,
document_name=document_name,
additional_doc_metadata=additional_metadata,
)
else:
raise Exception(
f"Failed to download {url} (status: {response.status_code})"
@@ -311,28 +321,31 @@ def load_data(
_document_details = self._document_details_for_docset_id(docset_id)
if document_ids:
_document_details = [
d for d in _document_details if d["id"] in document_ids
d for d in _document_details if d[ID_KEY] in document_ids
]

_project_details = self._project_details_for_docset_id(docset_id)
combined_project_metadata = {}
if _project_details:
# if there are any projects for this docset, load project metadata
combined_project_metadata: Dict[str, Dict] = {}
if _project_details and self.include_project_metadata_in_doc_metadata:
# If there are any projects for this docset and the caller requested
# project metadata, load it.
for project in _project_details:
metadata = self._metadata_for_project(project)
combined_project_metadata.update(metadata)
for file_id in metadata:
if file_id not in combined_project_metadata:
combined_project_metadata[file_id] = metadata[file_id]
else:
combined_project_metadata[file_id].update(metadata[file_id])

for doc in _document_details:
doc_metadata = combined_project_metadata.get(doc["id"])
chunks += self._load_chunks_for_document(docset_id, doc, doc_metadata)
doc_id = doc[ID_KEY]
doc_name = doc.get(DOCUMENT_NAME_KEY)
doc_metadata = combined_project_metadata.get(doc_id)
chunks += self._load_chunks_for_document(
document_id=doc_id,
docset_id=docset_id,
document_name=doc_name,
additional_metadata=doc_metadata,
)

return chunks


if __name__ == "__main__":
reader = DocugamiReader()
print(
reader.load_data(
docset_id="ecxqpipcoe2p", document_ids=["43rj0ds7s0ur", "bpc1vibyeke2"]
)
)
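
With the configuration now attribute-driven, the removed smoke test above could be recreated roughly as follows (a sketch; the docset ID is deliberately elided rather than guessed):

if __name__ == "__main__":
    reader = DocugamiReader()
    reader.include_project_metadata_in_doc_metadata = False  # lighter smoke test
    print(reader.load_data(docset_id="..."))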