This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Updated Docugami loader (#771)
tjaffri authored Feb 8, 2024
1 parent 5263112 commit be02528
Showing 5 changed files with 474 additions and 1,146 deletions.
4 changes: 2 additions & 2 deletions llama_hub/docugami/README.md
@@ -18,8 +18,8 @@ from llama_index import download_loader

DocugamiReader = download_loader('DocugamiReader')

docset_id="ecxqpipcoe2p"
document_ids=["43rj0ds7s0ur", "bpc1vibyeke2"]
docset_id="tjwrr2ekqkc3"
document_ids=["ui7pkriyckwi", "1be3o7ch10iy"]

loader = DocugamiReader()
documents = loader.load_data(docset_id=docset_id, document_ids=document_ids)
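
For context, the documents returned by this loader feed directly into the usual llama_index flow. A minimal sketch, assuming a llama_index version of this era that exposes VectorStoreIndex at the top level:

from llama_index import VectorStoreIndex

# Build a queryable index over the Docugami chunks (illustrative usage, not part of this commit).
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
print(query_engine.query("What are the key dates in these documents?"))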
259 changes: 136 additions & 123 deletions llama_hub/docugami/base.py
@@ -1,26 +1,30 @@
"""Docugami reader."""

import hashlib
import io
import logging
import os
import re
from typing import Any, Dict, List, Mapping, Optional
from pathlib import Path
from typing import Dict, List, Mapping, Optional, Sequence, Union

import requests
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document

TD_NAME = "{http://www.w3.org/1999/xhtml}td"
TABLE_NAME = "{http://www.w3.org/1999/xhtml}table"

XPATH_KEY = "xpath"
DOCUMENT_ID_KEY = "id"
ID_KEY = "id"
DOCUMENT_SOURCE_KEY = "source"
DOCUMENT_NAME_KEY = "name"
STRUCTURE_KEY = "structure"
TAG_KEY = "tag"
PROJECTS_KEY = "projects"

DEFAULT_API_ENDPOINT = "https://api.docugami.com/v1preview1"

logger = logging.getLogger(__name__)


class DocugamiReader(BaseReader):
"""Docugami reader.
@@ -30,126 +34,122 @@ class DocugamiReader(BaseReader):
"""

api: str = DEFAULT_API_ENDPOINT
"""The Docugami API endpoint to use."""

access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY")
min_chunk_size: int = 32 # appended to next chunk to avoid over-chunking
"""The Docugami API access token to use."""

max_text_length = 4096
"""Max length of chunk text returned."""

min_text_length: int = 32
"""Threshold under which chunks are appended to next to avoid over-chunking."""

max_metadata_length = 512
"""Max length of metadata text returned."""

include_xml_tags: bool = False
"""Set to true for XML tags in chunk output text."""

parent_hierarchy_levels: int = 0
"""Set appropriately to get parent chunks using the chunk hierarchy."""

parent_id_key: str = "doc_id"
"""Metadata key for parent doc ID."""

sub_chunk_tables: bool = False
"""Set to True to return sub-chunks within tables."""

whitespace_normalize_text: bool = True
"""Set to False if you want to full whitespace formatting in the original
XML doc, including indentation."""

docset_id: Optional[str]
"""The Docugami API docset ID to use."""

document_ids: Optional[Sequence[str]]
"""The Docugami API document IDs to use."""

file_paths: Optional[Sequence[Union[Path, str]]]
"""The local file paths to use."""

include_project_metadata_in_doc_metadata: bool = True
"""Set to True if you want to include the project metadata in the doc metadata."""

def _parse_dgml(
self, document: Mapping, content: bytes, doc_metadata: Optional[Mapping] = None
self,
content: bytes,
document_name: Optional[str] = None,
additional_doc_metadata: Optional[Mapping] = None,
) -> List[Document]:
"""Parse a single DGML document into a list of Documents."""
try:
from lxml import etree
except ImportError:
raise ValueError(
raise ImportError(
"Could not import lxml python package. "
"Please install it with `pip install lxml`."
)

# helpers
def _xpath_qname_for_chunk(chunk: Any) -> str:
"""Get the xpath qname for a chunk."""
qname = f"{chunk.prefix}:{chunk.tag.split('}')[-1]}"

parent = chunk.getparent()
if parent is not None:
doppelgangers = [x for x in parent if x.tag == chunk.tag]
if len(doppelgangers) > 1:
idx_of_self = doppelgangers.index(chunk)
qname = f"{qname}[{idx_of_self + 1}]"

return qname

def _xpath_for_chunk(chunk: Any) -> str:
"""Get the xpath for a chunk."""
ancestor_chain = chunk.xpath("ancestor-or-self::*")
return "/" + "/".join(_xpath_qname_for_chunk(x) for x in ancestor_chain)

def _structure_value(node: Any) -> Optional[str]:
"""Get the structure value for a node."""
structure = (
"table"
if node.tag == TABLE_NAME
else node.attrib["structure"]
if "structure" in node.attrib
else None
try:
from dgml_utils.models import Chunk
from dgml_utils.segmentation import get_chunks
except ImportError:
raise ImportError(
"Could not import from dgml-utils python package. "
"Please install it with `pip install dgml-utils`."
)
return structure

def _is_structural(node: Any) -> bool:
"""Check if a node is structural."""
return _structure_value(node) is not None

def _is_heading(node: Any) -> bool:
"""Check if a node is a heading."""
structure = _structure_value(node)
return structure is not None and structure.lower().startswith("h")

def _get_text(node: Any) -> str:
"""Get the text of a node."""
return " ".join(node.itertext()).strip()

def _has_structural_descendant(node: Any) -> bool:
"""Check if a node has a structural descendant."""
for child in node:
if _is_structural(child) or _has_structural_descendant(child):
return True
return False

def _leaf_structural_nodes(node: Any) -> List:
"""Get the leaf structural nodes of a node."""
if _is_structural(node) and not _has_structural_descendant(node):
return [node]
else:
leaf_nodes = []
for child in node:
leaf_nodes.extend(_leaf_structural_nodes(child))
return leaf_nodes

def _create_doc(node: Any, text: str) -> Document:
"""Create a Document from a node and text."""
def _build_framework_chunk(dg_chunk: Chunk) -> Document:
# Stable IDs for chunks with the same text.
_hashed_id = hashlib.md5(dg_chunk.text.encode()).hexdigest()
metadata = {
XPATH_KEY: _xpath_for_chunk(node),
DOCUMENT_ID_KEY: document["id"],
DOCUMENT_NAME_KEY: document["name"],
STRUCTURE_KEY: node.attrib.get("structure", ""),
TAG_KEY: re.sub(r"\{.*\}", "", node.tag),
XPATH_KEY: dg_chunk.xpath,
ID_KEY: _hashed_id,
DOCUMENT_NAME_KEY: document_name,
STRUCTURE_KEY: dg_chunk.structure,
TAG_KEY: dg_chunk.tag,
}

if doc_metadata:
metadata.update(doc_metadata)
text = dg_chunk.text
if additional_doc_metadata:
if self.include_project_metadata_in_doc_metadata:
metadata.update(additional_doc_metadata)

return Document(
text=text,
text=text[: self.max_text_length],
metadata=metadata,
excluded_llm_metadata_keys=[XPATH_KEY, DOCUMENT_ID_KEY, STRUCTURE_KEY],
excluded_llm_metadata_keys=[XPATH_KEY, ID_KEY, STRUCTURE_KEY],
)
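
The MD5-of-text scheme above gives chunks with identical text the same ID across runs, which is what lets repeated loads dedupe cleanly. A quick illustration of that property:

import hashlib

a = hashlib.md5("Section 1. Definitions".encode()).hexdigest()
b = hashlib.md5("Section 1. Definitions".encode()).hexdigest()
assert a == b  # identical text always hashes to the identical chunk ID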

# parse the tree and return chunks
# Parse the tree and return chunks
tree = etree.parse(io.BytesIO(content))
root = tree.getroot()

chunks: List[Document] = []
prev_small_chunk_text = None
for node in _leaf_structural_nodes(root):
text = _get_text(node)
if prev_small_chunk_text:
text = prev_small_chunk_text + " " + text
prev_small_chunk_text = None

if _is_heading(node) or len(text) < self.min_chunk_size:
# Save headings or other small chunks to be appended to the next chunk
prev_small_chunk_text = text
else:
chunks.append(_create_doc(node, text))

if prev_small_chunk_text and len(chunks) > 0:
# small chunk at the end left over, just append to last chunk
if not chunks[-1].text:
chunks[-1].text = prev_small_chunk_text
else:
chunks[-1].text += " " + prev_small_chunk_text
dg_chunks = get_chunks(
root,
min_text_length=self.min_text_length,
max_text_length=self.max_text_length,
whitespace_normalize_text=self.whitespace_normalize_text,
sub_chunk_tables=self.sub_chunk_tables,
include_xml_tags=self.include_xml_tags,
parent_hierarchy_levels=self.parent_hierarchy_levels,
)

return chunks
framework_chunks: Dict[str, Document] = {}
for dg_chunk in dg_chunks:
framework_chunk = _build_framework_chunk(dg_chunk)
chunk_id = framework_chunk.metadata.get(ID_KEY)
if chunk_id:
framework_chunks[chunk_id] = framework_chunk
if dg_chunk.parent:
framework_parent_chunk = _build_framework_chunk(dg_chunk.parent)
parent_id = framework_parent_chunk.metadata.get(ID_KEY)
                if parent_id and framework_parent_chunk.text:
framework_chunk.metadata[self.parent_id_key] = parent_id
framework_chunks[parent_id] = framework_parent_chunk

return list(framework_chunks.values())

def _document_details_for_docset_id(self, docset_id: str) -> List[Dict]:
"""Gets all document details for the given docset ID"""
@@ -197,11 +197,12 @@ def _project_details_for_docset_id(self, docset_id: str) -> List[Dict]:

def _metadata_for_project(self, project: Dict) -> Dict:
"""Gets project metadata for all files"""
project_id = project.get("id")
project_id = project.get(ID_KEY)

url = f"{self.api}/projects/{project_id}/artifacts/latest"
all_artifacts = []

per_file_metadata: Dict = {}
while url:
response = requests.request(
"GET",
@@ -213,22 +214,24 @@ def _metadata_for_project(self, project: Dict) -> Dict:
data = response.json()
all_artifacts.extend(data["artifacts"])
url = data.get("next", None)
elif response.status_code == 404:
# Not found is ok, just means no published projects
return per_file_metadata
else:
raise Exception(
f"Failed to download {url} (status: {response.status_code})"
)
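
The loop above follows Docugami's cursor-style pagination: keep requesting as long as the response carries a "next" URL. The same pattern in isolation (a sketch; the bearer-token header is an assumption, since the actual request headers are elided from this hunk):

import requests

def fetch_all_artifacts(url: str, token: str) -> list:
    items = []
    while url:  # follow "next" links until the API stops returning one
        resp = requests.get(url, headers={"Authorization": f"Bearer {token}"})
        resp.raise_for_status()
        data = resp.json()
        items.extend(data.get("artifacts", []))
        url = data.get("next")
    return items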

per_file_metadata = {}
for artifact in all_artifacts:
artifact_name = artifact.get("name")
artifact_url = artifact.get("url")
artifact_doc = artifact.get("document")

if artifact_name == "report-values.xml" and artifact_url and artifact_doc:
doc_id = artifact_doc["id"]
doc_id = artifact_doc[ID_KEY]
metadata: Dict = {}

# the evaluated XML for each document is named after the project
# The evaluated XML for each document is named after the project
response = requests.request(
"GET",
f"{artifact_url}/content",
@@ -240,7 +243,7 @@ def _metadata_for_project(self, project: Dict) -> Dict:
try:
from lxml import etree
except ImportError:
raise ValueError(
raise ImportError(
"Could not import lxml python package. "
"Please install it with `pip install lxml`."
)
@@ -253,7 +256,7 @@ def _metadata_for_project(self, project: Dict) -> Dict:
value = " ".join(
entry.xpath("./pr:Value", namespaces=ns)[0].itertext()
).strip()
metadata[heading] = value
metadata[heading] = value[: self.max_metadata_length]
per_file_metadata[doc_id] = metadata
else:
raise Exception(
@@ -264,10 +267,13 @@ def _metadata_for_project(self, project: Dict) -> Dict:
return per_file_metadata

def _load_chunks_for_document(
self, docset_id: str, document: Dict, doc_metadata: Optional[Dict] = None
self,
document_id: str,
docset_id: str,
document_name: Optional[str] = None,
additional_metadata: Optional[Mapping] = None,
) -> List[Document]:
"""Load chunks for a document."""
document_id = document["id"]
url = f"{self.api}/docsets/{docset_id}/documents/{document_id}/dgml"

response = requests.request(
@@ -278,7 +284,11 @@ def _load_chunks_for_document(
)

if response.ok:
return self._parse_dgml(document, response.content, doc_metadata)
return self._parse_dgml(
content=response.content,
document_name=document_name,
additional_doc_metadata=additional_metadata,
)
else:
raise Exception(
f"Failed to download {url} (status: {response.status_code})"
@@ -311,28 +321,31 @@ def load_data(
_document_details = self._document_details_for_docset_id(docset_id)
if document_ids:
_document_details = [
d for d in _document_details if d["id"] in document_ids
d for d in _document_details if d[ID_KEY] in document_ids
]

_project_details = self._project_details_for_docset_id(docset_id)
combined_project_metadata = {}
if _project_details:
# if there are any projects for this docset, load project metadata
combined_project_metadata: Dict[str, Dict] = {}
if _project_details and self.include_project_metadata_in_doc_metadata:
# If there are any projects for this docset and the caller requested
# project metadata, load it.
for project in _project_details:
metadata = self._metadata_for_project(project)
combined_project_metadata.update(metadata)
for file_id in metadata:
if file_id not in combined_project_metadata:
combined_project_metadata[file_id] = metadata[file_id]
else:
combined_project_metadata[file_id].update(metadata[file_id])

for doc in _document_details:
doc_metadata = combined_project_metadata.get(doc["id"])
chunks += self._load_chunks_for_document(docset_id, doc, doc_metadata)
doc_id = doc[ID_KEY]
doc_name = doc.get(DOCUMENT_NAME_KEY)
doc_metadata = combined_project_metadata.get(doc_id)
chunks += self._load_chunks_for_document(
document_id=doc_id,
docset_id=docset_id,
document_name=doc_name,
additional_metadata=doc_metadata,
)

return chunks


if __name__ == "__main__":
reader = DocugamiReader()
print(
reader.load_data(
docset_id="ecxqpipcoe2p", document_ids=["43rj0ds7s0ur", "bpc1vibyeke2"]
)
)
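
With the configuration now attribute-driven, the removed smoke test above could be recreated roughly as follows (a sketch; the docset ID is deliberately elided rather than guessed):

if __name__ == "__main__":
    reader = DocugamiReader()
    reader.include_project_metadata_in_doc_metadata = False  # lighter smoke test
    print(reader.load_data(docset_id="..."))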