Skip to content

Commit

Permalink
Remove the unnecessary SDG class
Browse files Browse the repository at this point in the history
A pipeline chains together a sequence of blocks, and an SDG chains together
a sequence of pipelines. There is no need for this additional layer: we
can construct a single pipeline with the full sequence of blocks instead of
chaining pipelines together.

Signed-off-by: Mark McLoughlin <[email protected]>
  • Loading branch information
markmc committed Jul 27, 2024
1 parent ca30d98 commit 01961fe
Show file tree
Hide file tree
Showing 6 changed files with 13 additions and 41 deletions.
4 changes: 1 addition & 3 deletions scripts/test_freeform_skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from openai import OpenAI

# First Party
from src.instructlab.sdg import SDG
from src.instructlab.sdg.pipeline import (
FULL_PIPELINES_PACKAGE,
Pipeline,
Expand Down Expand Up @@ -60,8 +59,7 @@
with resources.path(FULL_PIPELINES_PACKAGE, "freeform_skills.yaml") as yaml_path:
skills_pipe = Pipeline.from_file(ctx, yaml_path)

sdg = SDG([skills_pipe])
gen_data = sdg.generate(ds)
gen_data = skills_pipe.generate(ds)

print(gen_data)
print(gen_data[0])
4 changes: 1 addition & 3 deletions scripts/test_grounded_skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from openai import OpenAI

# First Party
from src.instructlab.sdg import SDG
from src.instructlab.sdg.pipeline import (
FULL_PIPELINES_PACKAGE,
Pipeline,
Expand Down Expand Up @@ -108,8 +107,7 @@
with resources.path(FULL_PIPELINES_PACKAGE, "grounded_skills.yaml") as yaml_path:
skills_pipe = Pipeline.from_file(ctx, yaml_path)

sdg = SDG([skills_pipe])
gen_data = sdg.generate(ds)
gen_data = skills_pipe.generate(ds)

print(gen_data)
print(gen_data[0])
4 changes: 1 addition & 3 deletions scripts/test_knowledge.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from openai import OpenAI

# First Party
from src.instructlab.sdg import SDG
from src.instructlab.sdg.pipeline import (
FULL_PIPELINES_PACKAGE,
Pipeline,
Expand Down Expand Up @@ -47,8 +46,7 @@
with resources.path(FULL_PIPELINES_PACKAGE, "knowledge.yaml") as yaml_path:
knowledge_pipe = Pipeline.from_file(ctx, yaml_path)

sdg = SDG([knowledge_pipe])
mmlubench_data = sdg.generate(ds)
mmlubench_data = knowledge_pipe.generate(ds)

print(mmlubench_data)
print(mmlubench_data[0])
2 changes: 0 additions & 2 deletions src/instructlab/sdg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
"SamplePopulatorBlock",
"SelectorBlock",
"SetToMajorityValueBlock",
"SDG",
"SIMPLE_PIPELINES_PACKAGE",
"FULL_PIPELINES_PACKAGE",
"generate_data",
Expand All @@ -42,7 +41,6 @@
PipelineConfigParserError,
PipelineContext,
)
from .sdg import SDG
from .utilblocks import (
CombineColumnsBlock,
DuplicateColumnsBlock,
Expand Down
19 changes: 10 additions & 9 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
Pipeline,
PipelineContext,
)
from instructlab.sdg.sdg import SDG
from instructlab.sdg.utils import GenerateException, models
from instructlab.sdg.utils.taxonomy import (
leaf_node_to_samples,
Expand Down Expand Up @@ -241,9 +240,9 @@ def load_pipeline(yaml_basename):
return Pipeline.from_file(ctx, os.path.join(pipeline, yaml_basename))

return (
SDG([load_pipeline("knowledge.yaml")]),
SDG([load_pipeline("freeform_skills.yaml")]),
SDG([load_pipeline("grounded_skills.yaml")]),
load_pipeline("knowledge.yaml"),
load_pipeline("freeform_skills.yaml"),
load_pipeline("grounded_skills.yaml"),
)


Expand Down Expand Up @@ -361,7 +360,9 @@ def generate_data(
batch_num_workers=num_cpus,
)

sdg_knowledge, sdg_freeform_skill, sdg_grounded_skill = _sdg_init(ctx, pipeline)
knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe = _sdg_init(
ctx, pipeline
)

# Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline)
mmlu_ctx = dataclasses.replace(ctx, checkpoint_dir=None)
Expand All @@ -384,19 +385,19 @@ def generate_data(
raise GenerateException("Error: No samples found in leaf node.")

if samples[0].get("document"):
sdg = sdg_knowledge
pipe = knowledge_pipe
is_knowledge = True

elif samples[0].get("seed_context"):
sdg = sdg_grounded_skill
pipe = grounded_skills_pipe

else:
sdg = sdg_freeform_skill
pipe = freeform_skills_pipe

logger.debug("Samples: %s", samples)
ds = Dataset.from_list(samples)
logger.debug("Dataset: %s", ds)
new_generated_data = sdg.generate(ds)
new_generated_data = pipe.generate(ds)
if len(new_generated_data) == 0:
raise EmptyDatasetError(
"Pipeline stopped: Empty dataset after running pipe"
Expand Down
21 changes: 0 additions & 21 deletions src/instructlab/sdg/sdg.py

This file was deleted.

0 comments on commit 01961fe

Please sign in to comment.