Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Docling models path #362

Merged
merged 13 commits into from
Nov 14, 2024
22 changes: 22 additions & 0 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# instructlab - All of these need to go away (other than sdg) - issue #6
from xdg_base_dirs import xdg_data_dirs, xdg_data_home
import openai
import yaml

# First Party
# pylint: disable=ungrouped-imports
Expand Down Expand Up @@ -220,6 +221,25 @@ def _sdg_init(ctx, pipeline):
data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())

docling_model_path = None
sdg_models_path = docling_model_path
for d in data_dirs:
if os.path.exists(os.path.join(d, "models")):
sdg_models_path = os.path.join(d, "models")
break
jaideepr97 marked this conversation as resolved.
Show resolved Hide resolved

if sdg_models_path is not None:
try:
with open(
os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8"
) as file:
config = yaml.safe_load(file)
docling_model_path = config["models"][0]["path"]
aakankshaduggal marked this conversation as resolved.
Show resolved Hide resolved
except (FileNotFoundError, NotADirectoryError, PermissionError) as e:
logger.warning(
f"unable to read docling models path from config.yaml {e}"
)

for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
if os.path.exists(pipeline_path):
Expand Down Expand Up @@ -295,6 +315,7 @@ def generate_data(
batch_size: Optional[int] = None,
checkpoint_dir: Optional[str] = None,
max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
docling_model_path: Optional[str] = None,
aakankshaduggal marked this conversation as resolved.
Show resolved Hide resolved
) -> None:
"""Generate data for training and testing a model.

Expand Down Expand Up @@ -392,6 +413,7 @@ def generate_data(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path=docling_model_path,
)

if not samples:
Expand Down
14 changes: 12 additions & 2 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def __new__(
server_ctx_size=4096,
chunk_word_count=1024,
tokenizer_model_name: str | None = None,
docling_model_path: str | None = None,
):
"""Instantiate the appropriate chunker for the provided document

Expand Down Expand Up @@ -115,6 +116,7 @@ def __new__(
output_dir,
chunk_word_count,
tokenizer_model_name,
docling_model_path=docling_model_path,
)

@staticmethod
Expand Down Expand Up @@ -189,6 +191,7 @@ def __init__(
output_dir: Path,
chunk_word_count: int,
tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
docling_model_path=None,
):
self.document_paths = document_paths
self.filepaths = filepaths
Expand All @@ -201,6 +204,7 @@ def __init__(
)

self.tokenizer = self.create_tokenizer(tokenizer_model_name)
self.docling_model_path = docling_model_path

def chunk_documents(self) -> List:
"""Semantically chunk PDF documents.
Expand All @@ -211,8 +215,14 @@ def chunk_documents(self) -> List:
if self.document_paths == []:
return []

model_artifacts_path = StandardPdfPipeline.download_models_hf()
pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path)
if self.docling_model_path is None:
logger.info("Docling models not found on disk, downloading models...")
self.docling_model_path = StandardPdfPipeline.download_models_hf()
else:
logger.info("Found the docling models")

pipeline_options = PdfPipelineOptions(artifacts_path=self.docling_model_path)

# Keep OCR models on the CPU instead of GPU
pipeline_options.ocr_options.use_gpu = False
converter = DocumentConverter(
Expand Down
4 changes: 4 additions & 0 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ def _knowledge_leaf_node_to_samples(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path=None,
):
chunker = DocumentChunker(
leaf_node=leaf_node,
Expand All @@ -424,6 +425,7 @@ def _knowledge_leaf_node_to_samples(
server_ctx_size=server_ctx_size,
chunk_word_count=chunk_word_count,
tokenizer_model_name=model_name,
docling_model_path=docling_model_path,
)
chunks = chunker.chunk_documents()

Expand Down Expand Up @@ -453,6 +455,7 @@ def leaf_node_to_samples(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path=None,
):
if not leaf_node:
return []
Expand All @@ -464,5 +467,6 @@ def leaf_node_to_samples(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path,
)
return _skill_leaf_node_to_samples(leaf_node)
Loading