From 01961fec4fbd18cce3fa2c4bf65bf67c4c879105 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Tue, 2 Jul 2024 13:09:00 +0100 Subject: [PATCH] Remove the unnecessary SDG class A pipeline chains together a sequence of blocks, and an SDG chains together a sequence of pipelines. There is no need for this additional layer - we can construct a pipeline with the full sequence of blocks, and not chain pipelines together. Signed-off-by: Mark McLoughlin --- scripts/test_freeform_skills.py | 4 +--- scripts/test_grounded_skills.py | 4 +--- scripts/test_knowledge.py | 4 +--- src/instructlab/sdg/__init__.py | 2 -- src/instructlab/sdg/generate_data.py | 19 ++++++++++--------- src/instructlab/sdg/sdg.py | 21 --------------------- 6 files changed, 13 insertions(+), 41 deletions(-) delete mode 100644 src/instructlab/sdg/sdg.py diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index 70dd6cf4..a06df9f9 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -6,7 +6,6 @@ from openai import OpenAI # First Party -from src.instructlab.sdg import SDG from src.instructlab.sdg.pipeline import ( FULL_PIPELINES_PACKAGE, Pipeline, @@ -60,8 +59,7 @@ with resources.path(FULL_PIPELINES_PACKAGE, "freeform_skills.yaml") as yaml_path: skills_pipe = Pipeline.from_file(ctx, yaml_path) -sdg = SDG([skills_pipe]) -gen_data = sdg.generate(ds) +gen_data = skills_pipe.generate(ds) print(gen_data) print(gen_data[0]) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index 5578db56..6f761e77 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -6,7 +6,6 @@ from openai import OpenAI # First Party -from src.instructlab.sdg import SDG from src.instructlab.sdg.pipeline import ( FULL_PIPELINES_PACKAGE, Pipeline, @@ -108,8 +107,7 @@ with resources.path(FULL_PIPELINES_PACKAGE, "grounded_skills.yaml") as yaml_path: skills_pipe = Pipeline.from_file(ctx, yaml_path) -sdg = SDG([skills_pipe]) -gen_data = sdg.generate(ds) +gen_data = skills_pipe.generate(ds) print(gen_data) print(gen_data[0]) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index fc65a275..fec6fe52 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -7,7 +7,6 @@ from openai import OpenAI # First Party -from src.instructlab.sdg import SDG from src.instructlab.sdg.pipeline import ( FULL_PIPELINES_PACKAGE, Pipeline, @@ -47,8 +46,7 @@ with resources.path(FULL_PIPELINES_PACKAGE, "knowledge.yaml") as yaml_path: knowledge_pipe = Pipeline.from_file(ctx, yaml_path) -sdg = SDG([knowledge_pipe]) -mmlubench_data = sdg.generate(ds) +mmlubench_data = knowledge_pipe.generate(ds) print(mmlubench_data) print(mmlubench_data[0]) diff --git a/src/instructlab/sdg/__init__.py b/src/instructlab/sdg/__init__.py index 76c75d31..b2500ae3 100644 --- a/src/instructlab/sdg/__init__.py +++ b/src/instructlab/sdg/__init__.py @@ -21,7 +21,6 @@ "SamplePopulatorBlock", "SelectorBlock", "SetToMajorityValueBlock", - "SDG", "SIMPLE_PIPELINES_PACKAGE", "FULL_PIPELINES_PACKAGE", "generate_data", @@ -42,7 +41,6 @@ PipelineConfigParserError, PipelineContext, ) -from .sdg import SDG from .utilblocks import ( CombineColumnsBlock, DuplicateColumnsBlock, diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 3231f9ec..ae467f71 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -30,7 +30,6 @@ Pipeline, PipelineContext, ) -from instructlab.sdg.sdg import SDG from instructlab.sdg.utils import GenerateException, models from instructlab.sdg.utils.taxonomy import ( leaf_node_to_samples, @@ -241,9 +240,9 @@ def load_pipeline(yaml_basename): return Pipeline.from_file(ctx, os.path.join(pipeline, yaml_basename)) return ( - SDG([load_pipeline("knowledge.yaml")]), - SDG([load_pipeline("freeform_skills.yaml")]), - SDG([load_pipeline("grounded_skills.yaml")]), + load_pipeline("knowledge.yaml"), + load_pipeline("freeform_skills.yaml"), + load_pipeline("grounded_skills.yaml"), ) @@ -361,7 +360,9 @@ def generate_data( batch_num_workers=num_cpus, ) - sdg_knowledge, sdg_freeform_skill, sdg_grounded_skill = _sdg_init(ctx, pipeline) + knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe = _sdg_init( + ctx, pipeline + ) # Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline) mmlu_ctx = dataclasses.replace(ctx, checkpoint_dir=None) @@ -384,19 +385,19 @@ def generate_data( raise GenerateException("Error: No samples found in leaf node.") if samples[0].get("document"): - sdg = sdg_knowledge + pipe = knowledge_pipe is_knowledge = True elif samples[0].get("seed_context"): - sdg = sdg_grounded_skill + pipe = grounded_skills_pipe else: - sdg = sdg_freeform_skill + pipe = freeform_skills_pipe logger.debug("Samples: %s", samples) ds = Dataset.from_list(samples) logger.debug("Dataset: %s", ds) - new_generated_data = sdg.generate(ds) + new_generated_data = pipe.generate(ds) if len(new_generated_data) == 0: raise EmptyDatasetError( "Pipeline stopped: Empty dataset after running pipe" diff --git a/src/instructlab/sdg/sdg.py b/src/instructlab/sdg/sdg.py deleted file mode 100644 index 7bfba702..00000000 --- a/src/instructlab/sdg/sdg.py +++ /dev/null @@ -1,21 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Third Party -from datasets import Dataset - -# Local -from .pipeline import Pipeline - - -# This is part of the public API. -class SDG: - def __init__(self, pipelines: list[Pipeline]) -> None: - self.pipelines = pipelines - - def generate(self, dataset: Dataset): - """ - Generate the dataset by running the chained pipeline steps. - dataset: the input dataset - """ - for pipeline in self.pipelines: - dataset = pipeline.generate(dataset) - return dataset