Skip to content

Commit

Permalink
Update generate_data.py to fix caching (#17)
Browse files Browse the repository at this point in the history
* Add index to the cache to point to the right leaf node.
* Fix the save paths to make them concise

Signed-off-by: Aakanksha Duggal <[email protected]>
  • Loading branch information
aakankshaduggal authored Jul 18, 2024
1 parent 94b49ca commit 89e3b44
Showing 1 changed file with 4 additions and 5 deletions.
9 changes: 4 additions & 5 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import json
import os
import time

# Third Party
# instructlab - All of these need to go away (other than sdg) - issue #6
from datasets import Dataset, concatenate_datasets
Expand Down Expand Up @@ -241,7 +240,7 @@ def generate_data(
# add to 1.0 recipe

Check warning on line 241 in src/instructlab/sdg/generate_data.py

View workflow job for this annotation

GitHub Actions / lint

C0303: Trailing whitespace (trailing-whitespace)

generated_data = sdg.generate(ds, cache_dataset_path='~/tmp/cache.jsonl')
generated_data = sdg.generate(ds, cache_dataset_path=f"~/tmp/cache_{leaf_node_path}.jsonl")

if is_knowledge:
knowledge_phase_data = create_phase07_ds(generated_data)
Expand All @@ -257,10 +256,10 @@ def generate_data(

# generate mmlubench data for the current leaf node

Check warning on line 257 in src/instructlab/sdg/generate_data.py

View workflow job for this annotation

GitHub Actions / lint

C0303: Trailing whitespace (trailing-whitespace)
mmlubench_data = create_mmlu_evaluation_dataset(sdg_mmlubench.generate(ds))
eval_data_file_path=f"{output_dir}/node_datasets_{date_suffix}/mmlubench_{date_suffix}_{leaf_node_path}.jsonl"
eval_data_file_path=f"{output_dir}/node_datasets_{date_suffix}/mmlubench_{leaf_node_path}.jsonl"
logger.info(f"Saving MMLU Dataset {eval_data_file_path}")
mmlubench_data.to_json(eval_data_file_path, orient='records', lines=True)
yaml_file_path=f"{output_dir}/node_datasets_{date_suffix}/{leaf_node_path}_{date_suffix}_{leaf_node_path}_task.yaml"
yaml_file_path=f"{output_dir}/node_datasets_{date_suffix}/{leaf_node_path}_task.yaml"
logger.info(f"Saving MMLU Task yaml {yaml_file_path}")
create_mmlu_evaluation_yaml(task_name=leaf_node_path,

Check warning on line 264 in src/instructlab/sdg/generate_data.py

View workflow job for this annotation

GitHub Actions / lint

C0303: Trailing whitespace (trailing-whitespace)
eval_data_file_path=eval_data_file_path,
Expand All @@ -275,7 +274,7 @@ def generate_data(
num_proc=8,
)

fpath = os.path.join(output_dir, f"node_datasets_{date_suffix}/node_{i}.jsonl")
fpath = os.path.join(output_dir, f"node_datasets_{date_suffix}/{leaf_node_path}.jsonl")
messages.to_json(fpath, orient="records", lines=True)
skills_recipe.add_dataset(fpath, NUM_SYNTH_SKILLS)

Expand Down

0 comments on commit 89e3b44

Please sign in to comment.