Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Unstructured.IO API support #648

Merged
merged 4 commits into from
Nov 22, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fixed linting
András Otártics committed Nov 22, 2023
commit b7521f424ba714e2cd3729ae4fd4f0f0271fb962
67 changes: 33 additions & 34 deletions llama_hub/file/unstructured/base.py
Original file line number Diff line number Diff line change
@@ -11,33 +11,34 @@
from llama_index.readers.schema.base import Document
import json


class UnstructuredReader(BaseReader):
    """General unstructured text reader for a variety of files."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Init params.

        Keyword Args:
            url: Base URL of an Unstructured.io API server. Setting it
                switches parsing to API mode.
            api: Explicit bool override for API mode (takes precedence
                over the default implied by ``url``).
            api_key: API key sent along with API requests.
        """
        super().__init__(*args)  # not passing kwargs to parent bc it cannot accept it

        self.api = False  # we default to local
        if "url" in kwargs:
            self.server_url = str(kwargs["url"])
            self.api = True  # if url was set, switch to api
        else:
            self.server_url = "http://localhost:8000"

        if "api" in kwargs:
            self.api = kwargs["api"]

        self.api_key = ""
        if "api_key" in kwargs:
            self.api_key = kwargs["api_key"]

        # Prerequisite for Unstructured.io to work
        import nltk

        nltk.download("punkt")
        nltk.download("averaged_perceptron_tagger")

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        split_documents: Optional[bool] = False,
    ) -> List[Document]:
        """Load data using Unstructured.io.

        Depending on the construction, if ``url`` was set or ``api=True``,
        the file is parsed through the Unstructured.io API; otherwise it
        is parsed locally.

        Args:
            file: Path of the file to parse.
            extra_info: Extra metadata merged into every returned document.
            split_documents: If True, return one Document per parsed
                element; otherwise join all elements into one Document.

        Returns:
            List of documents.
        """
        if self.api:
            # Parse through the hosted Unstructured.io API.
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(
                filename=str(file),
                api_key=self.api_key,
                api_url=self.server_url + "/general/v0/general",
            )
        else:
            # Parse file locally.
            from unstructured.partition.auto import partition

            elements = partition(filename=str(file))

        # Process elements
        docs = []
        if split_documents:
            for node in elements:
                metadata = {}
                if hasattr(node, "metadata"):
                    # Load metadata fields
                    for field, val in vars(node.metadata).items():
                        if field == "_known_field_names":
                            continue
                        # removing coordinates because it does not serialize
                        # and dont want to bother with it
                        if field == "coordinates":
                            continue
                        # removing bc it might cause interference
                        if field == "parent_id":
                            continue
                        metadata[field] = val

                if extra_info is not None:
                    metadata.update(extra_info)

                metadata["filename"] = str(file)
                docs.append(Document(text=node.text, extra_info=metadata))

        else:
            text_chunks = [" ".join(str(el).split()) for el in elements]

            metadata = {}

            if extra_info is not None:
                metadata.update(extra_info)

            metadata["filename"] = str(file)
            # Create a single document by joining all the texts
            docs.append(Document(text="\n\n".join(text_chunks), extra_info=metadata))

        return docs