Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Docling models path #362

Merged
merged 13 commits into from
Nov 14, 2024
22 changes: 22 additions & 0 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# instructlab - All of these need to go away (other than sdg) - issue #6
from xdg_base_dirs import xdg_data_dirs, xdg_data_home
import openai
import yaml

# First Party
# pylint: disable=ungrouped-imports
Expand Down Expand Up @@ -220,6 +221,25 @@ def _sdg_init(ctx, pipeline):
data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())

docling_model_path = None
sdg_models_path = docling_model_path
for d in data_dirs:
if os.path.exists(os.path.join(d, "models")):
sdg_models_path = os.path.join(d, "models")
break
jaideepr97 marked this conversation as resolved.
Show resolved Hide resolved

if sdg_models_path is not None:
try:
with open(
os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8"
) as file:
config = yaml.safe_load(file)
docling_model_path = config["models"][0]["path"]
aakankshaduggal marked this conversation as resolved.
Show resolved Hide resolved
except (FileNotFoundError, NotADirectoryError, PermissionError) as e:
logger.warning(
f"unable to read docling models path from config.yaml {e}"
)

for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
if os.path.exists(pipeline_path):
Expand Down Expand Up @@ -295,6 +315,7 @@ def generate_data(
batch_size: Optional[int] = None,
checkpoint_dir: Optional[str] = None,
max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
docling_model_path: Optional[str] = None,
aakankshaduggal marked this conversation as resolved.
Show resolved Hide resolved
) -> None:
"""Generate data for training and testing a model.

Expand Down Expand Up @@ -392,6 +413,7 @@ def generate_data(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path=docling_model_path,
)

if not samples:
Expand Down
14 changes: 12 additions & 2 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def __new__(
server_ctx_size=4096,
chunk_word_count=1024,
tokenizer_model_name: str | None = None,
docling_model_path: str | None = None,
):
"""Instantiate the appropriate chunker for the provided document

Expand Down Expand Up @@ -115,6 +116,7 @@ def __new__(
output_dir,
chunk_word_count,
tokenizer_model_name,
docling_model_path=docling_model_path,
)

@staticmethod
Expand Down Expand Up @@ -189,6 +191,7 @@ def __init__(
output_dir: Path,
chunk_word_count: int,
tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
docling_model_path=None,
):
self.document_paths = document_paths
self.filepaths = filepaths
Expand All @@ -201,6 +204,7 @@ def __init__(
)

self.tokenizer = self.create_tokenizer(tokenizer_model_name)
self.docling_model_path = docling_model_path

def chunk_documents(self) -> List:
"""Semantically chunk PDF documents.
Expand All @@ -211,8 +215,14 @@ def chunk_documents(self) -> List:
if self.document_paths == []:
return []

model_artifacts_path = StandardPdfPipeline.download_models_hf()
pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path)
if self.docling_model_path is None:
logger.info("Docling models not found on disk, downloading models...")
self.docling_model_path = StandardPdfPipeline.download_models_hf()
else:
logger.info("Found the docling models")

pipeline_options = PdfPipelineOptions(artifacts_path=self.docling_model_path)

# Keep OCR models on the CPU instead of GPU
pipeline_options.ocr_options.use_gpu = False
converter = DocumentConverter(
Expand Down
4 changes: 4 additions & 0 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ def _knowledge_leaf_node_to_samples(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path=None,
):
chunker = DocumentChunker(
leaf_node=leaf_node,
Expand All @@ -424,6 +425,7 @@ def _knowledge_leaf_node_to_samples(
server_ctx_size=server_ctx_size,
chunk_word_count=chunk_word_count,
tokenizer_model_name=model_name,
docling_model_path=docling_model_path,
)
chunks = chunker.chunk_documents()

Expand Down Expand Up @@ -453,6 +455,7 @@ def leaf_node_to_samples(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path=None,
):
if not leaf_node:
return []
Expand All @@ -464,5 +467,6 @@ def leaf_node_to_samples(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path,
)
return _skill_leaf_node_to_samples(leaf_node)
Loading