Skip to content

Commit

Permalink
Update docling model path and move hf model download to chunkers
Browse files Browse the repository at this point in the history
Signed-off-by: Aakanksha Duggal <[email protected]>
  • Loading branch information
aakankshaduggal committed Nov 13, 2024
1 parent 1b984e0 commit a6b6454
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 20 deletions.
24 changes: 9 additions & 15 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
import time

# Third Party
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

Check warning on line 15 in src/instructlab/sdg/generate_data.py

View workflow job for this annotation

GitHub Actions / pylint

W0611: Unused StandardPdfPipeline imported from docling.pipeline.standard_pdf_pipeline (unused-import)

# instructlab - All of these need to go away (other than sdg) - issue #6
from xdg_base_dirs import xdg_data_dirs, xdg_data_home
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
import openai

# First Party
Expand Down Expand Up @@ -42,6 +43,7 @@

_SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant."


def _unescape(s):
return bytes(s, "utf-8").decode("utf-8").strip()

Expand Down Expand Up @@ -210,29 +212,21 @@ def _context_init(
**extra_kwargs,
)


def _sdg_init(ctx, pipeline):
pipeline_pkg = None

# Search for the pipeline in User and Site data directories
# then for a package defined pipeline
# and finally pipelines referenced by absolute path
data_dir = os.path.join(xdg_data_home(), "instructlab", "sdg", "models", "docling")
data_dirs = [data_dir]
data_dirs.extend(
os.path.join(dir, "instructlab", "sdg", "models", "docling") for dir in xdg_data_dirs()
)
data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())

# Set `docling_model_path` to consistently use `data_dir`
docling_model_path = Path(data_dir)
# Initialize docling model path
docling_model_path = os.path.join(xdg_data_home(), "models", "docling")
# Ensure the `docling_model_path` directory exists
os.makedirs(docling_model_path, exist_ok=True)

if not os.listdir(docling_model_path):
# Download models if directory is empty
logger.info("Docling models for chunking not found locally. Downloading from Hugging Face...")
StandardPdfPipeline.download_models_hf()
else:
logger.info(f"Using existing Docling models from: {docling_model_path}")

for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
if os.path.exists(pipeline_path):
Expand Down
17 changes: 13 additions & 4 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import DefaultDict, Iterable, List, Tuple
import json
import logging
import os
import re

# Third Party
Expand Down Expand Up @@ -191,7 +192,7 @@ def __init__(
output_dir: Path,
chunk_word_count: int,
tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
docling_model_path=None
docling_model_path=None,
):
self.document_paths = document_paths
self.filepaths = filepaths
Expand All @@ -216,9 +217,17 @@ def chunk_documents(self) -> List:
return []

if not self.docling_model_path.exists():
raise FileNotFoundError(f"Docling model path not found: {self.docling_model_path}")
print("docling_model_path", docling_model_path)
pipeline_options = PdfPipelineOptions(artifacts_path=docling_model_path)
logger.info(
f"Docling model path {self.docling_model_path} not found, downloading models..."
)
os.makedirs(self.docling_model_path, exist_ok=True)
StandardPdfPipeline.download_models_hf(

Check failure on line 224 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

E1123: Unexpected keyword argument 'destination_path' in staticmethod call (unexpected-keyword-arg)
destination_path=self.docling_model_path
)
else:
logger.info("Found the docling models")

pipeline_options = PdfPipelineOptions(artifacts_path=self.docling_model_path)

# Keep OCR models on the CPU instead of GPU
pipeline_options.ocr_options.use_gpu = False
Expand Down
2 changes: 1 addition & 1 deletion src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ def _knowledge_leaf_node_to_samples(
server_ctx_size=server_ctx_size,
chunk_word_count=chunk_word_count,
tokenizer_model_name=model_name,
docling_model_path=docling_model_path
docling_model_path=docling_model_path,
)
chunks = chunker.chunk_documents()

Expand Down

0 comments on commit a6b6454

Please sign in to comment.