-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #280 from JamesKunstle/smoketests
adds basic smoketests for main_ds and data_process CLI args
- Loading branch information
Showing
2 changed files
with
232 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
## Overview | ||
|
||
`smoketest.sh` cd's into the source directory and runs the script-entrypoint for `main_ds.py` and `data_process.py`. Testing will break if file names or locations in the source tree change. | ||
|
||
Existing tests are "smoke tests," meant to demonstrate that training completes (returns 0) or not. This is helpful to check if all required dependencies are installed. | ||
|
||
Current tests add features as they go: | ||
|
||
1. No Flash Attention or Granite | ||
2. No Granite but Flash Attention enabled | ||
3. Granite and Flash Attention enabled | ||
|
||
## Usage | ||
|
||
The testing script can be run without parameters as `./smoketest.sh`. By default, this will run all tests with `FSDP` as the distributed training backend. To change the distributed training backend to the other available option, one can run the script as `./smoketest.sh deepspeed`. | ||
|
||
The second positional argument is the number of GPUs to use (default: 8), e.g. `./smoketest.sh fsdp 8`. This runs the tests on 8 GPUs with FSDP as the distributed backend. | ||
|
||
> [!NOTE] | ||
> You'll need to install the training library to run the test. Inside a virtual environment and from inside the repo, please run `pip3 install -e .` to install the package in editable mode. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
#!/usr/bin/env bash
# Strict mode: abort on the first failing command (errexit), treat unset
# variables as errors (nounset), echo every command as it runs (xtrace), and
# make a pipeline fail when any of its stages fails (pipefail).
set -o errexit -o nounset -o xtrace -o pipefail
|
||
# ############### Read-only parameters ###############
# Values are assigned first and frozen with `readonly` afterwards so that a
# failing command substitution (e.g. mktemp) is not masked by the builtin.
MODEL_NAME="instructlab/granite-7b-lab"
# gets directory of current file.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# main_ds.py and data_process.py are invoked by relative name from here.
CORRECT_WORKING_DIR="${SCRIPT_DIR}/../src/instructlab/training/"
SAMPLE_DATA_PATH="${SCRIPT_DIR}/../sample-data/train_all_pruned_SDG.jsonl"
# Scratch space for this run; holds the processed data and the checkpoints.
TMP_DIR="$(mktemp -d)"
CHECKPOINTS_DIR="${TMP_DIR}/checkpoints"
DATA_DIR="${TMP_DIR}/data"
COMPUTED_DATA_PATH="${DATA_DIR}/data.jsonl"
DEFAULT_DISTRIB_FRAMEWORK='fsdp'
# First positional argument: distributed training backend (defaults to FSDP).
DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}"
DEFAULT_GPUS=8
# Second positional argument: value handed to torchrun's --nproc_per_node.
NUM_GPUS="${2:-$DEFAULT_GPUS}"
readonly MODEL_NAME SCRIPT_DIR CORRECT_WORKING_DIR SAMPLE_DATA_PATH
readonly TMP_DIR CHECKPOINTS_DIR DATA_DIR COMPUTED_DATA_PATH
readonly DEFAULT_DISTRIB_FRAMEWORK DISTRIB_FRAMEWORK DEFAULT_GPUS NUM_GPUS
|
||
# ############### User-modifiable parameters ###############
# Change these as needed, either here or by exporting the variable of the
# same name before running the script; the values below are the defaults.
MAX_BATCH_LEN="${MAX_BATCH_LEN:-60000}"
NUM_SAMPLES_TRAINED_ON="${NUM_SAMPLES_TRAINED_ON:-5000}" # upper-bound on training dataset size.
|
||
# ############### Test Functions ############### | ||
|
||
#######################################
# Creates the two working directories used by the tests: one for the
# preprocessed dataset and one for the checkpoints written during training.
# Their parent directory must already exist; an already-existing target is
# treated as an error (plain mkdir, no -p).
# Globals:
#   CHECKPOINTS_DIR
#   DATA_DIR
# Arguments:
#   None
# Returns:
#   Non-zero if either directory could not be created.
#######################################
function setup_tmpdir () {
  # `--` keeps a path that starts with '-' from being parsed as an option.
  mkdir -- "$CHECKPOINTS_DIR" "$DATA_DIR"
}
|
||
#######################################
# Preprocesses the sample dataset with data_process.py and then truncates
# the result to at most NUM_SAMPLES_TRAINED_ON lines.
# Must be called from the directory that contains data_process.py.
# Globals:
#   SAMPLE_DATA_PATH
#   DATA_DIR
#   MODEL_NAME
#   NUM_SAMPLES_TRAINED_ON
#   COMPUTED_DATA_PATH
# Arguments:
#   None
# Outputs:
#   Writes "TRAINING ON <n> SAMPLES" to standard out, where <n> is the line
#   count of the trimmed dataset.
# Returns:
#   Status of the final echo; earlier failures abort only under `set -e`.
#######################################
function prepare_data () {
  # preprocesses .jsonl messages data so that it's a valid
  # input to the model (inputs tokenized, formatted with mask, etc.)
  # then, data is trimmed to a determined length to make training
  # go faster.
  python3 data_process.py \
    --data_path="$SAMPLE_DATA_PATH" \
    --data_output_path="$DATA_DIR" \
    --max_seq_len=4096 \
    --model_name_or_path="$MODEL_NAME"

  # trim data so we only keep the first 'n' samples.
  # should be enough data for training to be meaningful but not enough
  # that training takes a large amount of time.
  # The trimmed copy goes to a sibling file and is then moved into place, so
  # the dataset is streamed rather than held in a shell string and the input
  # is never truncated while head is still reading it.
  local trimmed_path="${COMPUTED_DATA_PATH}.trimmed"
  head -n "$NUM_SAMPLES_TRAINED_ON" -- "$COMPUTED_DATA_PATH" > "$trimmed_path"
  mv -f -- "$trimmed_path" "$COMPUTED_DATA_PATH"

  # Read via stdin so wc prints only the count (no file name); some wc
  # implementations pad the number with blanks, so strip whitespace.
  local num_samples
  num_samples="$(wc -l < "$COMPUTED_DATA_PATH")"
  echo "TRAINING ON ${num_samples//[[:space:]]/} SAMPLES"
}
|
||
#######################################
# Deletes the checkpoints directory with everything in it and recreates it
# empty, so the next training run starts from a clean output directory.
# Globals:
#   CHECKPOINTS_DIR
# Arguments:
#   None
# Outputs:
#   Writes the location of the checkpoints dir to standard out.
# Returns:
#   Non-zero if the directory could not be recreated; aborts the shell if
#   CHECKPOINTS_DIR is unset or empty.
#######################################
function _cleanup_saved_checkpoints() {
  # Refuse to run a recursive delete against an empty or unset path.
  : "${CHECKPOINTS_DIR:?CHECKPOINTS_DIR must be set and non-empty}"
  echo "CLEARING CHECKPOINTS: $CHECKPOINTS_DIR"
  rm -rf -- "$CHECKPOINTS_DIR"
  mkdir -- "$CHECKPOINTS_DIR"
}
|
||
#######################################
# Runs a single training epoch through main_ds.py with the Granite flag set
# (--is_granite) and without disabling Flash Attention.
# Must be called from the directory that contains main_ds.py.
# Globals:
#   NUM_GPUS
#   MODEL_NAME
#   COMPUTED_DATA_PATH
#   CHECKPOINTS_DIR
#   DISTRIB_FRAMEWORK
#   MAX_BATCH_LEN
# Arguments:
#   None
# Returns:
#   Exit status of torchrun.
#######################################
function test_standard_loop () {
  # Arguments forwarded to main_ds.py, one array element per flag.
  local -a train_args=(
    --model_name_or_path="$MODEL_NAME"
    --data_path="$COMPUTED_DATA_PATH"
    --output_dir="$CHECKPOINTS_DIR"
    --num_epochs=1
    --effective_batch_size=128
    --save_samples=0
    --checkpoint_at_epoch
    --accelerate_full_state_at_epoch
    --distributed_training_framework="$DISTRIB_FRAMEWORK"
    --max_batch_len="$MAX_BATCH_LEN"
    --is_granite
  )

  torchrun \
    --standalone \
    --nproc_per_node="$NUM_GPUS" \
    main_ds.py "${train_args[@]}"
}
|
||
#######################################
# Runs a single training epoch through main_ds.py with neither the Granite
# flag (--is_granite) nor --disable_flash_attn passed.
# Must be called from the directory that contains main_ds.py.
# Globals:
#   NUM_GPUS
#   MODEL_NAME
#   COMPUTED_DATA_PATH
#   CHECKPOINTS_DIR
#   DISTRIB_FRAMEWORK
#   MAX_BATCH_LEN
# Arguments:
#   None
# Returns:
#   Exit status of torchrun.
#######################################
function test_standard_loop_nongranite () {
  # Arguments forwarded to main_ds.py, one array element per flag.
  # --is_granite is deliberately left out of this variant.
  local -a train_args=(
    --model_name_or_path="$MODEL_NAME"
    --data_path="$COMPUTED_DATA_PATH"
    --output_dir="$CHECKPOINTS_DIR"
    --num_epochs=1
    --effective_batch_size=128
    --save_samples=0
    --checkpoint_at_epoch
    --accelerate_full_state_at_epoch
    --distributed_training_framework="$DISTRIB_FRAMEWORK"
    --max_batch_len="$MAX_BATCH_LEN"
  )

  torchrun \
    --standalone \
    --nproc_per_node="$NUM_GPUS" \
    main_ds.py "${train_args[@]}"
}
|
||
#######################################
# Runs a single training epoch through main_ds.py with Flash Attention
# disabled (--disable_flash_attn) and without the Granite flag
# (--is_granite).
# Must be called from the directory that contains main_ds.py.
# Globals:
#   NUM_GPUS
#   MODEL_NAME
#   COMPUTED_DATA_PATH
#   CHECKPOINTS_DIR
#   DISTRIB_FRAMEWORK
#   MAX_BATCH_LEN
# Arguments:
#   None
# Returns:
#   Exit status of torchrun.
#######################################
function test_standard_loop_noflashattention_nogranite () {
  # Arguments forwarded to main_ds.py, one array element per flag.
  # --is_granite is deliberately left out of this variant.
  local -a train_args=(
    --model_name_or_path="$MODEL_NAME"
    --data_path="$COMPUTED_DATA_PATH"
    --output_dir="$CHECKPOINTS_DIR"
    --num_epochs=1
    --effective_batch_size=128
    --save_samples=0
    --checkpoint_at_epoch
    --accelerate_full_state_at_epoch
    --distributed_training_framework="$DISTRIB_FRAMEWORK"
    --max_batch_len="$MAX_BATCH_LEN"
    --disable_flash_attn
  )

  torchrun \
    --standalone \
    --nproc_per_node="$NUM_GPUS" \
    main_ds.py "${train_args[@]}"
}
|
||
#######################################
# Entry point: prepares the scratch directories and the dataset, then runs
# the three training smoke tests in order of increasing feature use,
# clearing the checkpoints directory between runs.
# Globals:
#   TMP_DIR
#   CORRECT_WORKING_DIR
# Arguments:
#   None
# Outputs:
#   Writes the working directory it switched to to standard out.
# Returns:
#   Status of the last test; under `set -e` the first failing step aborts.
#######################################
function main () {
  # Register cleanup before anything is created inside TMP_DIR so a failure
  # in any later step still removes it. Single quotes defer the expansion
  # until the trap fires, and the inner double quotes keep a path that
  # contains whitespace in one piece.
  trap 'rm -rf -- "$TMP_DIR"' EXIT
  setup_tmpdir

  #NOTE (jkunstle): script is run as though it's
  # in the same source dir as main_ds and data_process.
  cd "$CORRECT_WORKING_DIR"
  echo "CURRENT WORKING DIRECTORY: $(pwd)"

  prepare_data
  test_standard_loop_noflashattention_nogranite
  _cleanup_saved_checkpoints
  test_standard_loop_nongranite
  _cleanup_saved_checkpoints
  test_standard_loop
}
|
||
# Run the smoke tests, forwarding the script's arguments to the entry point.
main "$@"