Skip to content

Commit

Permalink
Merge pull request #209 from derekhiggins/mmlu-orig-samples
Browse files Browse the repository at this point in the history
Generate mmlu bench data with the original samples
  • Loading branch information
derekhiggins authored Jul 25, 2024
2 parents afadfd5 + 98013bc commit b292b7a
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 6 deletions.
9 changes: 7 additions & 2 deletions src/instructlab/sdg/eval_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,11 @@ def _format_mmlu_style(ds: Dataset) -> Dataset:
ds = ds.filter(lambda x: x["choices"])
ds = ds.filter(lambda x: len(x["choices"]) == 4)
ds = ds.filter(lambda x: x["answer"] in ["A", "B", "C", "D"])
ds = ds.class_encode_column("answer")
# We filter out a lot of the dataset above (and in _post_process_mcq).
# If we've managed to filter out all of the results, we don't want to run
# class_encode_column, as the answer column might not exist.
if len(ds):
ds = ds.class_encode_column("answer")
return ds


Expand Down Expand Up @@ -113,7 +117,8 @@ def generate_eval_task_data(
mmlubench_pipe, task_name, samples, output_dir, date_suffix
):
mmlubench_data = mmlubench_pipe.generate(samples)
mmlubench_data = _post_process_mcq(mmlubench_data)
if len(mmlubench_data):
mmlubench_data = _post_process_mcq(mmlubench_data)

eval_data_file_path = (
f"{output_dir}/node_datasets_{date_suffix}/mmlubench_{task_name}.jsonl"
Expand Down
7 changes: 6 additions & 1 deletion src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from instructlab.sdg.pipeline import (
FULL_PIPELINES_PACKAGE,
SIMPLE_PIPELINES_PACKAGE,
EmptyDatasetError,
Pipeline,
PipelineContext,
)
Expand Down Expand Up @@ -371,6 +372,10 @@ def generate_data(
ds = Dataset.from_list(samples)
logger.debug("Dataset: %s" % ds)
new_generated_data = sdg.generate(ds)
if len(new_generated_data) == 0:
raise EmptyDatasetError(
"Pipeline stopped: Empty dataset after running pipe"
)
generated_data = (
[new_generated_data]
if generated_data is None
Expand All @@ -384,7 +389,7 @@ def generate_data(
generate_eval_task_data(
mmlu_bench_pipe,
leaf_node_path,
new_generated_data,
ds,
output_dir,
date_suffix,
)
Expand Down
4 changes: 1 addition & 3 deletions src/instructlab/sdg/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,7 @@ def _generate_single(self, dataset) -> Dataset:

# If at any point we end up with an empty data set, stop running further
# blocks and return it; the caller decides whether an empty result is an error
if len(dataset) == 0:
raise EmptyDatasetError(
f"Pipeline stopped: Empty dataset after running block: {block_name}"
)
return dataset

drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names]
if drop_columns:
Expand Down

0 comments on commit b292b7a

Please sign in to comment.