Skip to content

Commit

Permalink
Add functionality to import hf precomputed dataset in the recipe
Browse files (browse the repository at this point in the history)
Signed-off-by: Aakanksha Duggal <[email protected]>
  • Loading branch information
aakankshaduggal committed Jul 24, 2024
1 parent a302954 commit 125669b
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
 datasets:
-  - path: <path_to_dataset>
+  - path: instructlab/InstructLabCommunity
     sampling_size: 1.0

sys_prompt: |
Expand Down
8 changes: 6 additions & 2 deletions src/instructlab/sdg/utils/datamixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,12 @@ def adjust_train_sample_size(ds: Dataset, num_samples: int):


 def load_ds(path, sampling_size):
-    LOGGER.info(f"Loading dataset from {path} ...")
-    dataset = load_dataset("json", data_files=path, split="train")
+    if path.endswith(".jsonl"):
+        LOGGER.info(f"Loading dataset from {path} ...")
+        dataset = load_dataset("json", data_files=path, split="train")
+    else:
+        LOGGER.info(f"Loading dataset from HF {path} ...")
+        dataset = load_dataset(path, split="train")
     LOGGER.info(f"Dataset columns: {dataset.column_names}")
     LOGGER.info(f"Dataset loaded with {len(dataset)} samples")

Expand Down

0 comments on commit 125669b

Please sign in to comment.