Skip to content

Commit

Permalink
Rebase
Browse files Browse the repository at this point in the history
Signed-off-by: Aakanksha Duggal <[email protected]>
  • Loading branch information
aakankshaduggal committed Nov 13, 2024
1 parent 9de4d35 commit 1b984e0
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def __new__(
server_ctx_size=4096,
chunk_word_count=1024,
tokenizer_model_name: str | None = None,
docling_model_path: str | None = None,
):
"""Instantiate the appropriate chunker for the provided document
Expand Down Expand Up @@ -115,6 +116,7 @@ def __new__(
output_dir,
chunk_word_count,
tokenizer_model_name,
docling_model_path=docling_model_path,
)

@staticmethod
Expand Down Expand Up @@ -189,6 +191,7 @@ def __init__(
output_dir: Path,
chunk_word_count: int,
tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
docling_model_path=None
):
self.document_paths = document_paths
self.filepaths = filepaths
Expand All @@ -201,6 +204,7 @@ def __init__(
)

self.tokenizer = self.create_tokenizer(tokenizer_model_name)
self.docling_model_path = docling_model_path

def chunk_documents(self) -> List:
"""Semantically chunk PDF documents.
Expand All @@ -211,8 +215,11 @@ def chunk_documents(self) -> List:
if self.document_paths == []:
return []

model_artifacts_path = StandardPdfPipeline.download_models_hf()
pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path)
if not self.docling_model_path.exists():
raise FileNotFoundError(f"Docling model path not found: {self.docling_model_path}")
print("docling_model_path", docling_model_path)

Check failure on line 220 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

E0602: Undefined variable 'docling_model_path' (undefined-variable)
pipeline_options = PdfPipelineOptions(artifacts_path=docling_model_path)

Check failure on line 221 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

E0602: Undefined variable 'docling_model_path' (undefined-variable)

# Keep OCR models on the CPU instead of GPU
pipeline_options.ocr_options.use_gpu = False
converter = DocumentConverter(
Expand Down

0 comments on commit 1b984e0

Please sign in to comment.