Skip to content

Commit

Permalink
Modify _get_sub_docs to use Custom Separator (#254)
Browse files Browse the repository at this point in the history
Move _get_sub_docs from a module-level function to a private method of LlamaParse
  • Loading branch information
adreichert authored Jul 17, 2024
1 parent 7b90d03 commit 8938286
Showing 1 changed file with 18 additions and 16 deletions.
34 changes: 18 additions & 16 deletions llama_parse/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,7 @@
# if passing as bytes or a buffer, must provide the file_name in extra_info
FileInput = Union[str, bytes, BufferedIOBase]


def _get_sub_docs(docs: List[Document]) -> List[Document]:
    """Break every document into one document per page.

    The text of each input document is cut at the fixed ``"\\n---\\n"``
    separator. Every resulting piece becomes its own ``Document`` that
    carries an independent deep copy of the parent's metadata, so later
    mutation of one page's metadata does not affect its siblings.

    Args:
        docs: Documents whose ``text`` may contain page separators.

    Returns:
        The page-level documents, in the order they appear in ``docs``.
    """
    pages: List[Document] = []
    for source in docs:
        # A document without any separator yields exactly one page.
        pages.extend(
            Document(text=piece, metadata=deepcopy(source.metadata))
            for piece in source.text.split("\n---\n")
        )
    return pages
# Page separator used as the fallback when no custom page separator is set.
_DEFAULT_SEPARATOR = "\n---\n"


class LlamaParse(BasePydanticReader):
Expand Down Expand Up @@ -132,7 +119,7 @@ class LlamaParse(BasePydanticReader):
)
split_by_page: bool = Field(
default=True,
description="Whether to split by page (NOTE: using a predefined separator `\n---\n`)",
description="Whether to split by page using the page separator",
)
vendor_multimodal_api_key: Optional[str] = Field(
default=None,
Expand Down Expand Up @@ -318,7 +305,7 @@ async def _aload_data(
)
]
if self.split_by_page:
return _get_sub_docs(docs)
return self._get_sub_docs(docs)
else:
return docs

Expand Down Expand Up @@ -492,3 +479,18 @@ def get_images(self, json_result: List[dict], download_path: str) -> List[dict]:
return []
else:
raise e

def _get_sub_docs(self, docs: List[Document]) -> List[Document]:
    """Break every document into one document per page.

    The text of each input document is cut at ``self.page_separator``;
    when that attribute is falsy, ``_DEFAULT_SEPARATOR`` is used instead.
    Every resulting piece becomes its own ``Document`` carrying an
    independent deep copy of the parent's metadata.

    Args:
        docs: Documents whose ``text`` may contain page separators.

    Returns:
        The page-level documents, in the order they appear in ``docs``.
    """
    # `or` (rather than an `is None` check) also covers an empty string,
    # which `str.split` would reject as a separator.
    delimiter = self.page_separator or _DEFAULT_SEPARATOR
    pages: List[Document] = []
    for source in docs:
        pages.extend(
            Document(text=piece, metadata=deepcopy(source.metadata))
            for piece in source.text.split(delimiter)
        )
    return pages

0 comments on commit 8938286

Please sign in to comment.