Update the Docling model path and move the Hugging Face model download into the chunkers module

Signed-off-by: Aakanksha Duggal <[email protected]>
instructlab · Nov 13, 2024 · a6b6454 · a6b6454
1 parent 1b984e0
commit a6b6454
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 20 deletions.
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
@@ -12,9 +12,10 @@
 import time
 
 # Third Party
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+
 # instructlab - All of these need to go away (other than sdg) - issue #6
 from xdg_base_dirs import xdg_data_dirs, xdg_data_home
-from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 import openai
 
 # First Party
@@ -42,6 +43,7 @@
 
 _SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant."
 
+
 def _unescape(s):
     return bytes(s, "utf-8").decode("utf-8").strip()
 
@@ -210,29 +212,21 @@ def _context_init(
         **extra_kwargs,
     )
 
+
 def _sdg_init(ctx, pipeline):
     pipeline_pkg = None
 
     # Search for the pipeline in User and Site data directories
     # then for a package defined pipeline
     # and finally pipelines referenced by absolute path
-    data_dir = os.path.join(xdg_data_home(), "instructlab", "sdg", "models", "docling")
-    data_dirs = [data_dir]
-    data_dirs.extend(
-        os.path.join(dir, "instructlab", "sdg", "models", "docling") for dir in xdg_data_dirs()
-    )
+    data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
+    data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())
 
-    # Set `docling_model_path` to consistently use `data_dir`
-    docling_model_path = Path(data_dir)
+    # Initialize docling model path
+    docling_model_path = os.path.join(xdg_data_home(), "models", "docling")
+    # Ensure the `docling_model_path` directory exists
     os.makedirs(docling_model_path, exist_ok=True)
 
-    if not os.listdir(docling_model_path):
-        # Download models if directory is empty
-        logger.info("Docling models for chunking not found locally. Downloading from Hugging Face...")
-        StandardPdfPipeline.download_models_hf()
-    else:
-        logger.info(f"Using existing Docling models from: {docling_model_path}")
-
     for d in data_dirs:
         pipeline_path = os.path.join(d, "pipelines", pipeline)
         if os.path.exists(pipeline_path):

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
@@ -6,6 +6,7 @@
 from typing import DefaultDict, Iterable, List, Tuple
 import json
 import logging
+import os
 import re
 
 # Third Party
@@ -191,7 +192,7 @@ def __init__(
         output_dir: Path,
         chunk_word_count: int,
         tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
-        docling_model_path=None
+        docling_model_path=None,
     ):
         self.document_paths = document_paths
         self.filepaths = filepaths
@@ -216,9 +217,17 @@ def chunk_documents(self) -> List:
             return []
 
         if not self.docling_model_path.exists():
-            raise FileNotFoundError(f"Docling model path not found: {self.docling_model_path}")
-        print("docling_model_path", docling_model_path)
-        pipeline_options = PdfPipelineOptions(artifacts_path=docling_model_path)
+            logger.info(
+                f"Docling model path {self.docling_model_path} not found, downloading models..."
+            )
+            os.makedirs(self.docling_model_path, exist_ok=True)
+            StandardPdfPipeline.download_models_hf(
+                destination_path=self.docling_model_path
+            )
+        else:
+            logger.info("Found the docling models")
+
+        pipeline_options = PdfPipelineOptions(artifacts_path=self.docling_model_path)
 
         # Keep OCR models on the CPU instead of GPU
         pipeline_options.ocr_options.use_gpu = False

diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
@@ -425,7 +425,7 @@ def _knowledge_leaf_node_to_samples(
         server_ctx_size=server_ctx_size,
         chunk_word_count=chunk_word_count,
         tokenizer_model_name=model_name,
-        docling_model_path=docling_model_path
+        docling_model_path=docling_model_path,
     )
     chunks = chunker.chunk_documents()