Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
Fixed linting
Browse files Browse the repository at this point in the history
  • Loading branch information
András Otártics committed Nov 22, 2023
1 parent e1f73ef commit b7521f4
Showing 1 changed file with 33 additions and 34 deletions.
67 changes: 33 additions & 34 deletions llama_hub/file/unstructured/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,33 +11,34 @@
from llama_index.readers.schema.base import Document
import json


class UnstructuredReader(BaseReader):
"""General unstructured text reader for a variety of files."""

def __init__(self, *args: Any, **kwargs: Any) -> None:
"""Init params."""
super().__init__(*args) # not passing kwargs to parent bc it cannot accept it
self.api = False # we default to local
super().__init__(*args) # not passing kwargs to parent bc it cannot accept it

self.api = False # we default to local
if "url" in kwargs:
self.server_url = str(kwargs["url"])
self.api = True # if url was set, switch to api
self.api = True # if url was set, switch to api
else:
self.server_url = "http://localhost:8000"
self.server_url = "http://localhost:8000"

if "api" in kwargs:
self.api = kwargs["api"]

self.api_key = ""
if "api_key" in kwargs:
self.api_key = kwargs["api_key"]

# Prerequisite for Unstructured.io to work
import nltk

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

""" Loads data usin Unstructured.io py
Depending on the construction, if url is set or api = True
Expand All @@ -47,65 +48,63 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
Returns list of documents
"""

def load_data(
self,
file: Path,
extra_info: Optional[Dict] = None,
split_documents: Optional[bool] = False,
) -> List[Document]:

""" If api is set, parse through api"""
if (self.api):
"""If api is set, parse through api"""
if self.api:
from unstructured.partition.api import partition_via_api

elements = partition_via_api(
filename=str(file),
api_key = self.api_key,
api_url=self.server_url + "/general/v0/general"
api_key=self.api_key,
api_url=self.server_url + "/general/v0/general",
)
else:
""" Parse file locally """
"""Parse file locally"""
from unstructured.partition.auto import partition

elements= partition(filename=str(file))
elements = partition(filename=str(file))

""" Process elements """
docs = []
if (split_documents):
if split_documents:
for node in elements:
metadata = {}
if (hasattr(node, "metadata")):
""" Load metadata fields """
if hasattr(node, "metadata"):
"""Load metadata fields"""
for field, val in vars(node.metadata).items():
if (field == "_known_field_names"):
if field == "_known_field_names":
continue
# removing coordinates because it does not serialize
# and don't want to bother with it
if (field == "coordinates"):
if field == "coordinates":
continue
# removing bc it might cause interference
if (field == "parent_id"):
# removing bc it might cause interference
if field == "parent_id":
continue
metadata[field] = val

if extra_info is not None:
metadata.update(extra_info)
metadata.update(extra_info)

metadata["filename"] = str(file)
docs.append(Document(text=node.text, extra_info=metadata))

else:
text_chunks = [" ".join(str(el).split()) for el in elements]

metadata = {}

if extra_info is not None:
metadata.update(extra_info)
metadata.update(extra_info)

metadata["filename"] = str(file)
# Create a single document by joining all the texts
docs.append(Document(text = "\n\n".join(text_chunks), extra_info=metadata))
# Create a single document by joining all the texts
docs.append(Document(text="\n\n".join(text_chunks), extra_info=metadata))

return docs


0 comments on commit b7521f4

Please sign in to comment.