diff --git a/src/instructlab/sdg/eval_data.py b/src/instructlab/sdg/eval_data.py index 4afcd5ff..a95bd597 100644 --- a/src/instructlab/sdg/eval_data.py +++ b/src/instructlab/sdg/eval_data.py @@ -55,7 +55,11 @@ def _format_mmlu_style(ds: Dataset) -> Dataset: ds = ds.filter(lambda x: x["choices"]) ds = ds.filter(lambda x: len(x["choices"]) == 4) ds = ds.filter(lambda x: x["answer"] in ["A", "B", "C", "D"]) - ds = ds.class_encode_column("answer") + # We filter out a lot of the dataset above (and in _post_process_mcq) + # if we've managed to filter out all of the results we don't want to run class_encode_column + # as the answer column might not exist + if len(ds): + ds = ds.class_encode_column("answer") return ds @@ -113,7 +117,8 @@ def generate_eval_task_data( mmlubench_pipe, task_name, samples, output_dir, date_suffix ): mmlubench_data = mmlubench_pipe.generate(samples) - mmlubench_data = _post_process_mcq(mmlubench_data) + if len(mmlubench_data): + mmlubench_data = _post_process_mcq(mmlubench_data) eval_data_file_path = ( f"{output_dir}/node_datasets_{date_suffix}/mmlubench_{task_name}.jsonl" diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 80c80944..4d6fffe3 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -24,6 +24,7 @@ from instructlab.sdg.pipeline import ( FULL_PIPELINES_PACKAGE, SIMPLE_PIPELINES_PACKAGE, + EmptyDatasetError, Pipeline, PipelineContext, ) @@ -371,6 +372,10 @@ def generate_data( ds = Dataset.from_list(samples) logger.debug("Dataset: %s" % ds) new_generated_data = sdg.generate(ds) + if len(new_generated_data) == 0: + raise EmptyDatasetError( + "Pipeline stopped: Empty dataset after running pipe" + ) generated_data = ( [new_generated_data] if generated_data is None @@ -384,7 +389,7 @@ def generate_data( generate_eval_task_data( mmlu_bench_pipe, leaf_node_path, - new_generated_data, + ds, output_dir, date_suffix, ) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index ae71c2af..1263c974 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -184,9 +184,7 @@ def _generate_single(self, dataset) -> Dataset: # If at any point we end up with an empty data set, the pipeline has failed if len(dataset) == 0: - raise EmptyDatasetError( - f"Pipeline stopped: Empty dataset after running block: {block_name}" - ) + return dataset drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names] if drop_columns: