Working run_clm deepspeed for LUMI
mvsjober committed Oct 5, 2023
1 parent 57b76ed commit a8ecbac
Showing 3 changed files with 35 additions and 24 deletions.
benchmarks/ds_config_clm.json (16 changes: 14 additions & 2 deletions)
@@ -1,8 +1,11 @@
{
"optimizer": {
"type": "SGD",
"type": "Adam",
"params": {
"lr": "auto"
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
@@ -13,6 +16,15 @@
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
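
Note on the DeepSpeed config above: with the Hugging Face Trainer integration that run_clm.py uses, every "auto" value is filled in from the matching training argument at launch time (lr from --learning_rate, betas from --adam_beta1/--adam_beta2, eps from --adam_epsilon, weight_decay from --weight_decay), and ZeRO stage 2 additionally partitions optimizer states and gradients across the data-parallel ranks to cut per-GPU memory. As a minimal sketch, the command below spells out the flags that back those "auto" fields; the numeric values are the Trainer defaults, not settings made in this commit:

# Minimal sketch: explicit Trainer flags that the "auto" fields resolve to.
# The hyperparameter values are HF Trainer defaults, not values set by this
# commit, and the output directory is a placeholder.
python3 benchmarks/run_clm.py --deepspeed benchmarks/ds_config_clm.json \
    --learning_rate 5e-5 --adam_beta1 0.9 --adam_beta2 0.999 \
    --adam_epsilon 1e-8 --weight_decay 0.0 \
    --model_name_or_path EleutherAI/gpt-neo-1.3B \
    --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
    --per_device_train_batch_size 2 --do_train --num_train_epochs 1 \
    --output_dir /tmp/run-clm --overwrite_output_dir
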
pytorch-clm-deepspeed.sh (41 changes: 20 additions & 21 deletions)
@@ -3,40 +3,39 @@ export NCCL_DEBUG=INFO

SCRIPT="benchmarks/run_clm.py"
OUTPUT_DIR=/flash/project_462000007/mvsjober/run-clm
DS_CONFIG=benchmarks/ds_config_clm.json

export HF_HOME=/scratch/project_462000007/mvsjober/hf-home
export TORCH_HOME=/scratch/project_462000007/mvsjober/torch-cache

if [ "$SLURM_NTASKS" -ne "$SLURM_NNODES" ]; then
echo "ERROR: this script needs to be run as one task per node."
echo "SLURM_NNODES = $SLURM_NNODES != SLURM_NTASKS = $SLURM_NTASKS"
exit 1
fi


if [ "$SLURM_NNODES" -gt 1 ]; then
if [ $(( $NUM_GPUS * $SLURM_NNODES )) -ne $SLURM_NTASKS ]; then
echo "ERROR: this script needs to be run as one task per GPU. Try using slurm/*-mpi.sh scripts."
echo "NUM_GPUS * SLURM_NNODES = $NUM_GPUS * $SLURM_NNODES != SLURM_NTASKS = $SLURM_NTASKS"
exit 1
fi

(set -x
srun python3 $SCRIPT --deepspeed benchmarks/ds_config_clm.json \
--model_name_or_path EleutherAI/gpt-neo-1.3B \
--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
--per_device_train_batch_size 2 --do_train \
--output_dir $OUTPUT_DIR --overwrite_output_dir \
--gradient_accumulation_steps 1 \
--num_train_epochs 1 --dataloader_num_workers 7 $SCRIPT_OPTS $*
RDZV_HOST=$(hostname)
RDZV_PORT=29400

(set -x
srun python3 -m torch.distributed.run --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$RDZV_HOST:$RDZV_PORT \
--nnodes=$SLURM_NNODES --nproc_per_node=$NUM_GPUS $SCRIPT \
--deepspeed $DS_CONFIG \
--model_name_or_path EleutherAI/gpt-neo-1.3B \
--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
--per_device_train_batch_size 2 --do_train \
--output_dir $OUTPUT_DIR --overwrite_output_dir \
--num_train_epochs 1 --dataloader_num_workers 7 $SCRIPT_OPTS $*
)
else
if [ $SLURM_NTASKS -ne 1 ]; then
echo "ERROR: single node runs need to be run as a single task"
echo "SLURM_NTASKS = $SLURM_NTASKS != 1"
exit 1
fi

if [ ! -z $NUM_GPUS ]; then
SCRIPT_OPTS="--gradient_accumulation_steps $(( 8 / $NUM_GPUS ))"
fi

(set -x
srun singularity_wrapper exec deepspeed --num_gpus=$NUM_GPUS $SCRIPT --deepspeed benchmarks/ds_config_clm.json \
srun singularity_wrapper exec deepspeed --num_gpus=$NUM_GPUS $SCRIPT --deepspeed $DS_CONFIG \
--model_name_or_path EleutherAI/gpt-neo-1.3B \
--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
--per_device_train_batch_size 2 --do_train \
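
A note on the launch change: the multi-node branch now starts torch.distributed.run under srun, and the workers from all nodes rendezvous through the c10d backend at $RDZV_HOST:$RDZV_PORT, while the single-node branch keeps the deepspeed launcher inside singularity_wrapper and only switches to the new $DS_CONFIG variable. Outside Slurm, the multi-node case boils down to running the same torchrun command on every node; a rough sketch, in which the hostnames, per-node GPU count, rdzv_id, and output directory are assumptions rather than values from this commit:

# Rough sketch of the equivalent hand-run rendezvous launch on two 8-GPU nodes.
# node01/node02, the rdzv_id, and the output directory are hypothetical.
# Run the same command on both nodes; node01 acts as the rendezvous host.
python3 -m torch.distributed.run \
    --rdzv_id=clm-bench --rdzv_backend=c10d --rdzv_endpoint=node01:29400 \
    --nnodes=2 --nproc_per_node=8 \
    benchmarks/run_clm.py --deepspeed benchmarks/ds_config_clm.json \
    --model_name_or_path EleutherAI/gpt-neo-1.3B \
    --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
    --per_device_train_batch_size 2 --do_train \
    --output_dir /tmp/run-clm --overwrite_output_dir \
    --num_train_epochs 1 --dataloader_num_workers 7
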
slurm/lumi-gpu16.sh (2 changes: 1 addition & 1 deletion)
@@ -6,7 +6,7 @@
#SBATCH --cpus-per-task=56
#SBATCH --gpus-per-node=8
#SBATCH --mem=480G
#SBATCH --time=15
#SBATCH --time=30
#SBATCH --output=logs/slurm-%x-%j.out

cd $SLURM_SUBMIT_DIR
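
The Slurm batch file only gets its time limit raised from 15 to 30 minutes; with --gpus-per-node=8 and the gpu16 name it presumably describes a two-node, 16-GCD LUMI-G allocation. For reference, a submission would look roughly like the line below; how lumi-gpu16.sh actually hands control to the benchmark script is not part of this diff, so the argument passing shown is an assumption:

# Hypothetical submission from the repository root; whether lumi-gpu16.sh
# takes the benchmark script as its argument is not shown in this commit.
sbatch slurm/lumi-gpu16.sh pytorch-clm-deepspeed.sh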