Skip to content

Commit

Permalink
Merge pull request #64 from markmc/api-tweaks
Browse files Browse the repository at this point in the history
Remove the unnecessary SDG class
  • Loading branch information
markmc authored Jul 29, 2024
2 parents 2a91e7c + a9d93c4 commit 2dcbec7
Show file tree
Hide file tree
Showing 6 changed files with 14 additions and 46 deletions.
4 changes: 1 addition & 3 deletions scripts/test_freeform_skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from openai import OpenAI

# First Party
-from src.instructlab.sdg import SDG
from src.instructlab.sdg.pipeline import (
FULL_PIPELINES_PACKAGE,
Pipeline,
Expand Down Expand Up @@ -60,8 +59,7 @@
with resources.path(FULL_PIPELINES_PACKAGE, "freeform_skills.yaml") as yaml_path:
skills_pipe = Pipeline.from_file(ctx, yaml_path)

-sdg = SDG([skills_pipe])
-gen_data = sdg.generate(ds)
+gen_data = skills_pipe.generate(ds)

print(gen_data)
print(gen_data[0])
4 changes: 1 addition & 3 deletions scripts/test_grounded_skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from openai import OpenAI

# First Party
-from src.instructlab.sdg import SDG
from src.instructlab.sdg.pipeline import (
FULL_PIPELINES_PACKAGE,
Pipeline,
Expand Down Expand Up @@ -108,8 +107,7 @@
with resources.path(FULL_PIPELINES_PACKAGE, "grounded_skills.yaml") as yaml_path:
skills_pipe = Pipeline.from_file(ctx, yaml_path)

-sdg = SDG([skills_pipe])
-gen_data = sdg.generate(ds)
+gen_data = skills_pipe.generate(ds)

print(gen_data)
print(gen_data[0])
4 changes: 1 addition & 3 deletions scripts/test_knowledge.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from openai import OpenAI

# First Party
-from src.instructlab.sdg import SDG
from src.instructlab.sdg.pipeline import (
FULL_PIPELINES_PACKAGE,
Pipeline,
Expand Down Expand Up @@ -47,8 +46,7 @@
with resources.path(FULL_PIPELINES_PACKAGE, "knowledge.yaml") as yaml_path:
knowledge_pipe = Pipeline.from_file(ctx, yaml_path)

-sdg = SDG([knowledge_pipe])
-mmlubench_data = sdg.generate(ds)
+mmlubench_data = knowledge_pipe.generate(ds)

print(mmlubench_data)
print(mmlubench_data[0])
2 changes: 0 additions & 2 deletions src/instructlab/sdg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
"SamplePopulatorBlock",
"SelectorBlock",
"SetToMajorityValueBlock",
-"SDG",
"SIMPLE_PIPELINES_PACKAGE",
"FULL_PIPELINES_PACKAGE",
"generate_data",
Expand All @@ -42,7 +41,6 @@
PipelineConfigParserError,
PipelineContext,
)
-from .sdg import SDG
from .utilblocks import (
CombineColumnsBlock,
DuplicateColumnsBlock,
Expand Down
24 changes: 11 additions & 13 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
Pipeline,
PipelineContext,
)
-from instructlab.sdg.sdg import SDG
from instructlab.sdg.utils import GenerateException, models
from instructlab.sdg.utils.taxonomy import (
leaf_node_to_samples,
Expand Down Expand Up @@ -241,9 +240,9 @@ def load_pipeline(yaml_basename):
return Pipeline.from_file(ctx, os.path.join(pipeline, yaml_basename))

return (
-SDG([load_pipeline("knowledge.yaml")]),
-SDG([load_pipeline("freeform_skills.yaml")]),
-SDG([load_pipeline("grounded_skills.yaml")]),
+load_pipeline("knowledge.yaml"),
+load_pipeline("freeform_skills.yaml"),
+load_pipeline("grounded_skills.yaml"),
)


Expand Down Expand Up @@ -362,16 +361,15 @@ def generate_data(
batch_num_workers=num_cpus,
)

-sdg_knowledge, sdg_freeform_skill, sdg_grounded_skill = _sdg_init(ctx, pipeline)
+knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe = _sdg_init(
+ctx, pipeline
+)

# Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline)
mmlu_ctx = dataclasses.replace(ctx, checkpoint_dir=None)
mmlu_bench_pipe = mmlubench_pipe_init(mmlu_ctx)

-# FIXME: remove SDG https://github.com/instructlab/sdg/pull/64
-mixer = _mixer_init(
-ctx, output_dir, date_suffix, sdg_knowledge.pipelines[0].auxiliary_inst
-)
+mixer = _mixer_init(ctx, output_dir, date_suffix, knowledge_pipe.auxiliary_inst)

if console_output:
logger.info(
Expand All @@ -388,19 +386,19 @@ def generate_data(
raise GenerateException("Error: No samples found in leaf node.")

if samples[0].get("document"):
-sdg = sdg_knowledge
+pipe = knowledge_pipe
is_knowledge = True

elif samples[0].get("seed_context"):
-sdg = sdg_grounded_skill
+pipe = grounded_skills_pipe

else:
-sdg = sdg_freeform_skill
+pipe = freeform_skills_pipe

logger.debug("Samples: %s", samples)
ds = Dataset.from_list(samples)
logger.debug("Dataset: %s", ds)
-new_generated_data = sdg.generate(ds)
+new_generated_data = pipe.generate(ds)
if len(new_generated_data) == 0:
raise EmptyDatasetError(
"Pipeline stopped: Empty dataset after running pipe"
Expand Down
22 changes: 0 additions & 22 deletions src/instructlab/sdg/sdg.py

This file was deleted.

0 comments on commit 2dcbec7

Please sign in to comment.