Merge pull request #31 from pjlab-sys4nlp/scaling_13b
CPT: fix tb logging, fix grad ckpting, faster data loading
DaizeDong authored Oct 7, 2023
2 parents f8a56b4 + f0e5ae3 commit 4bff10e
Showing 34 changed files with 916 additions and 201 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -167,7 +167,7 @@ outputs/
/visualization/
results/analysis/cluster_*.png
results/expert_load_vis
results/analysis_clustering7
results/analysis_clustering*
results/gate_loss_100b
results/RandomSplit-l2_norm-llama_7B-16Select4-up_proj
results/gate_loss_original_clustering_model
2 changes: 1 addition & 1 deletion .vscode/launch.json
@@ -9,7 +9,7 @@
"type": "python",
"request": "attach",
"connect": {
"host": "SH-IDCA1404-10-140-54-115",
"host": "SH-IDCA1404-10-140-54-122",
"port": 5678
},
"pathMappings": [
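The updated launch.json entry points the Python "attach" debug configuration at the new node SH-IDCA1404-10-140-54-122 on port 5678. Attaching only works if a debugpy server is already listening at that address; a minimal sketch of starting one on that node is shown below (the srun options and target script are illustrative assumptions, not part of this commit).

# Hypothetical sketch: start a debugpy server on the debug node so the
# "attach" configuration above can connect on port 5678.
srun -p MoE -w SH-IDCA1404-10-140-54-122 \
    python -m debugpy --listen 0.0.0.0:5678 --wait-for-client your_script.py
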
60 changes: 35 additions & 25 deletions scripts/cpt/fpt.sh
@@ -1,33 +1,37 @@
#!/usr/bin/bash

#SBATCH --job-name=cpt-16select4-64gpus
#SBATCH --job-name=cpt-7b-test
#SBATCH --output=logs/%x-%j.log
#SBATCH --error=logs/%x-%j.log

#SBATCH --partition=MoE
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=0
#SBATCH -x SH-IDCA1404-10-140-54-116
#SBATCH -x SH-IDCA1404-10-140-54-116,SH-IDCA1404-10-140-54-70

#SBATCH --nodes=7
#SBATCH --nodes=1
#SBATCH --gres=gpu:8

source ~/anaconda3/bin/activate smoe

num_nodes=7 # should match with --nodes
num_gpu_per_node=8 # should match with --gres

# #cpu/#num_gpu_per_node
export OMP_NUM_THREADS=4
export LOGLEVEL=INFO
# export NCCL_DEBUG=INFO
# export TORCH_DISTRIBUTED_DEBUG=DETAIL
# export TORCH_SHOW_CPP_STACKTRACES=1
# export CUDA_LAUNCH_BLOCKING=1

{
num_nodes=1 # should match with --nodes
num_gpu_per_node=8 # should match with --gres

# #cpu/#num_gpu_per_node
export OMP_NUM_THREADS=16
export LOGLEVEL=INFO
# export NCCL_DEBUG=INFO
# export TORCH_DISTRIBUTED_DEBUG=DETAIL
# export TORCH_SHOW_CPP_STACKTRACES=1
# export CUDA_LAUNCH_BLOCKING=1

comment="exp purpose"

# model_type="llama"
# pretrained_model="outputs/llama1_7B_random"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
model_type="llama_moe"
pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B_MoE_16Select4-l2_norm_bak
@@ -40,13 +44,14 @@ export LOGLEVEL=INFO
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed

lr=3e-4
lr=1e-4
final_lr_portion=0.1
per_device_train_batch_size=8
per_device_train_batch_size=16
per_device_eval_batch_size=1
gradient_accumulation_steps=4
gradient_accumulation_steps=2
block_size=2048
num_tokens="1*10^11"
seed=1227
deepspeed_config_file=conf/deepspeed/bf16_zero1_default.json

max_steps=$(echo "${num_tokens} / ($block_size * $per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node)" | bc)
@@ -61,8 +66,11 @@ export LOGLEVEL=INFO
data_cache=resources/cache
output_dir=outputs/$SLURM_JOB_NAME-$SLURM_JOB_ID
mkdir -p $output_dir
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
echo "output_dir: $output_dir"
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
git diff > $output_dir/diff.patch
env > $output_dir/.env
echo $comment > $output_dir/comment.txt

nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIS ) )
nodes_array=($nodes)
@@ -78,7 +86,7 @@ export LOGLEVEL=INFO
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
@@ -89,7 +97,7 @@ export LOGLEVEL=INFO
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--do_train \
--seed $RANDOM \
--seed ${seed} \
--bf16 \
--num_train_epochs 1 \
--final_lr_portion ${final_lr_portion} \
@@ -102,8 +110,6 @@ export LOGLEVEL=INFO
--warmup_steps 2000 \
--max_steps ${max_steps} \
--max_train_samples ${max_train_samples} \
--logging_strategy steps \
--logging_steps 10 \
--save_strategy steps \
--save_total_limit 2 \
--save_steps 1000 \
@@ -113,12 +119,16 @@ export LOGLEVEL=INFO
--output_dir ${output_dir} \
--overwrite_output_dir \
--ddp_timeout 30000 \
--logging_first_step True \
--torch_dtype bfloat16 \
--ddp_find_unused_parameters False \
--torch_dtype bfloat16 \
--gradient_checkpointing \
--report_to none \
--log_level info
--logging_first_step True \
--logging_strategy steps \
--logging_steps 10 \
--log_level info \
--log_level_replica warning \
--log_on_each_node False \
--report_to none
}
#SBATCH --job-name=cpt-moe-fpt-test_lr_change
# Before the change: --logging_steps 10 \
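The max_steps and max_train_samples lines above derive the training schedule from the token budget rather than hard-coding it. With the values now set in fpt.sh (num_tokens="1*10^11", block_size=2048, per_device_train_batch_size=16, gradient_accumulation_steps=2, 1 node with 8 GPUs), the bc expressions evaluate roughly as in the sketch below, assuming these variables are not overridden at submission time.

# Rough check of the schedule arithmetic in fpt.sh (values copied from the script above).
num_tokens="1*10^11"; block_size=2048
per_device_train_batch_size=16; gradient_accumulation_steps=2
num_nodes=1; num_gpu_per_node=8

tokens_per_step=$(echo "$block_size * $per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node" | bc)
echo "tokens per optimizer step: $tokens_per_step"                     # 524288
echo "max_steps: $(echo "${num_tokens} / $tokens_per_step" | bc)"      # 190734
echo "max_train_samples: $(echo "${num_tokens} / $block_size" | bc)"   # 48828125
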
87 changes: 60 additions & 27 deletions scripts/cpt/fpt_13b.sh
@@ -1,59 +1,80 @@
#!/usr/bin/bash

#SBATCH --job-name=cpt-moe-fpt-13b-64gpus-bs8_4-task_test
#SBATCH --job-name=cpt-13b-test
#SBATCH --output=logs/%x-%j.log
#SBATCH --error=logs/%x-%j.log
##SBATCH --output=logs/%x.log
##SBATCH --error=logs/%x.log

#SBATCH --partition=MoE
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=0
#SBATCH -x SH-IDCA1404-10-140-54-116
#SBATCH --time=8:00:00

#SBATCH --nodes=8
#SBATCH --nodes=2
#SBATCH --gres=gpu:8
#SBATCH --quotatype=auto
##SBATCH --time=5:00:00

source ~/anaconda3/bin/activate smoe

{
num_nodes=8 # should match with --nodes
num_nodes=2 # should match with --nodes
num_gpu_per_node=8 # should match with --gres

# #cpu/#num_gpu_per_node
export OMP_NUM_THREADS=4
export OMP_NUM_THREADS=16
export LOGLEVEL=INFO
# export NCCL_DEBUG=INFO
# export TORCH_DISTRIBUTED_DEBUG=DETAIL
# export TORCH_SHOW_CPP_STACKTRACES=1
# export CUDA_LAUNCH_BLOCKING=1

lr=3e-4
# comment="13B, expert 4/16, noisy gate, seq len 2048, lr=4e-4, expert weight re-scale"
comment="13B, expert 4/16, noisy gate, seq len 2048, lr=4e-4"
# comment="random initialized llama1-7B"
# comment="random initialized llama1-13B"
# comment="7B, expert 4/16, noisy gate, gradient shared neurons, w/o residual, w/o weight re-scale, lr2e-4"
# comment="3B MoE, debug"

# model_type="llama"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
# model_type="llama_moe"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B_MoE_16Select4-l2_norm
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/llama_13B"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
model_type="llama_moe"
pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Clustering-l2/llama_13B-16Select4-up_proj"
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama_3B-8Select2-4320Neurons-Share"
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama_7B-16Select4-688Neurons-Share"
pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons-Share"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B_MoE_16Select4-l2_norm
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Clustering-l2/llama_13B-16Select4-up_proj"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
# pretrained_model=$1
echo "==================> $pretrained_model <=================="
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Clustering-l2/llama_13B-16Select4-up_proj
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Graph-l2_norm/llama_13B-16Select4-up_proj
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Random/llama_13B-16Select4-up_proj
# pretrained_model=$1
echo "==================> $pretrained_model <=================="

# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax-copy/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_13B
# tokenizer_path="/mnt/petrelfs/share_data/quxiaoye/models/llama_3B"

dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed
# dataset_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples_openllama3B_tokenized

lr=2e-4
final_lr_portion=0.1
per_device_train_batch_size=8
per_device_eval_batch_size=1
gradient_accumulation_steps=4
num_tokens="3*10^11"
seed=1227
block_size=2048
max_steps=$(echo "10^11 / ($block_size * $per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node)" | bc)
max_train_samples=$(echo "10^11 / $block_size" | bc)
deepspeed_config_file=conf/deepspeed/bf16_zero1_default.json

max_steps=$(echo "${num_tokens} / ($block_size * $per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node)" | bc)
max_train_samples=$(echo "${num_tokens} / $block_size" | bc)
echo "max_steps: $max_steps"
echo "max_train_samples: $max_train_samples"
global_bs=$(echo "$per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node" | bc)
@@ -63,8 +84,13 @@ source ~/anaconda3/bin/activate smoe

data_cache=resources/cache
output_dir=outputs/$SLURM_JOB_NAME-$SLURM_JOB_ID
# output_dir=/mnt/petrelfs/share_data/quxiaoye/models/tzhu_model_bak/cpt-13b-16gpus-lr2e-4
mkdir -p $output_dir
echo "output_dir: $output_dir"
deepspeed_config_file=conf/deepspeed/bf16_zero2_default.json
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
git diff > $output_dir/diff.patch
env > $output_dir/.env
echo $comment > $output_dir/comment.txt

nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIS ) )
nodes_array=($nodes)
@@ -73,14 +99,15 @@ source ~/anaconda3/bin/activate smoe
echo "Node: $head_node"
echo "Node IP: $head_node_ip"

# --resume_from_checkpoint /mnt/petrelfs/share_data/quxiaoye/models/tzhu_model_bak/cpt-13b-16gpus-lr2e-4/checkpoint-2000 \
srun torchrun \
--nnodes ${num_nodes} \
--nproc_per_node ${num_gpu_per_node} \
--node_rank $SLURM_NODEID \
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
@@ -91,10 +118,10 @@ source ~/anaconda3/bin/activate smoe
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--do_train \
--seed $RANDOM \
--seed ${seed} \
--bf16 \
--num_train_epochs 1 \
--final_lr_portion 0.1 \
--final_lr_portion ${final_lr_portion} \
--optim adamw_torch \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
@@ -103,9 +130,7 @@ source ~/anaconda3/bin/activate smoe
--max_grad_norm 1.0 \
--warmup_steps 2000 \
--max_steps ${max_steps} \
--max_train_samples 48828125 \
--logging_strategy steps \
--logging_steps 10 \
--max_train_samples ${max_train_samples} \
--save_strategy steps \
--save_total_limit 1 \
--save_steps 1000 \
@@ -115,10 +140,18 @@ source ~/anaconda3/bin/activate smoe
--output_dir ${output_dir} \
--overwrite_output_dir \
--ddp_timeout 30000 \
--logging_first_step True \
--torch_dtype bfloat16 \
--ddp_find_unused_parameters False \
--torch_dtype bfloat16 \
--gradient_checkpointing \
--report_to none \
--log_level info
--logging_first_step True \
--logging_strategy steps \
--logging_steps 10 \
--log_level info \
--log_level_replica warning \
--log_on_each_node False \
--gate_type "TopKBalancedNoisyGate" \
--calculator_type "UniversalCalculator" \
--num_selects 4 \
--report_to none

}
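fpt_13b.sh now uses the same token-budget arithmetic with a 3*10^11-token budget, a fixed seed (1227) instead of $RANDOM, and explicit MoE flags (gate_type, calculator_type, num_selects). With the values set in this script (2 nodes of 8 GPUs, per-device batch size 8, gradient accumulation 4, block size 2048), the derived quantities work out roughly as sketched below, again assuming nothing overrides them when the job is submitted.

# Rough check of the fpt_13b.sh schedule (values copied from the script above).
num_tokens="3*10^11"; block_size=2048
per_device_train_batch_size=8; gradient_accumulation_steps=4
num_nodes=2; num_gpu_per_node=8

global_bs=$(echo "$per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node" | bc)
echo "global batch size (sequences): $global_bs"                               # 512
echo "tokens per optimizer step: $(echo "$global_bs * $block_size" | bc)"      # 1048576
echo "max_steps: $(echo "${num_tokens} / ($global_bs * $block_size)" | bc)"    # 286102
echo "max_train_samples: $(echo "${num_tokens} / $block_size" | bc)"           # 146484375
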
2 changes: 1 addition & 1 deletion scripts/cpt/fpt_resume.sh
@@ -76,7 +76,7 @@ export LOGLEVEL=INFO
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--ignore_data_skip \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
2 changes: 1 addition & 1 deletion scripts/cpt/fpt_switch.sh
@@ -82,7 +82,7 @@ export LOGLEVEL=INFO
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
2 changes: 1 addition & 1 deletion scripts/cpt/fpt_test_lr.sh
@@ -69,7 +69,7 @@ export LOGLEVEL=INFO
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
2 changes: 1 addition & 1 deletion scripts/cpt/gate_loss.sh
@@ -75,7 +75,7 @@ export GATE_LOSS_RESULTS_DIR="results/RandomSplit-l2_norm-llama_7B-16Select4-up_
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--ignore_data_skip \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
2 changes: 1 addition & 1 deletion scripts/cpt/lora.sh
@@ -67,7 +67,7 @@ srun torchrun \
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_lora.py \
smoe/entrypoint/cpt/cpt_lora.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
12 changes: 6 additions & 6 deletions scripts/tokenize/clustering.sh
@@ -3,8 +3,8 @@
set -vx

tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
data_dir=/mnt/petrelfs/zhutong/smoe/resources/clustering_samples
out_dir=/mnt/petrelfs/share_data/quxiaoye/data/16clusters
data_dir=/mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32
out_dir=/mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32_tokenized
logs_dir=logs

mkdir -p $out_dir
@@ -13,13 +13,13 @@ mkdir -p $logs_dir
# for loop in: en_arxiv, en_book, en_c4, en_cc, en_stack, en_wikipedia, github
for data_type in $(ls $data_dir)
do
log_path=logs/tokenize_$data_type.log
nohup srun -p MoE -N1 -n1 --cpus-per-task=32 \
log_path=logs/tokenize_${data_type}_32clusters.log
nohup srun -p MoE -N1 -n1 --cpus-per-task=32 -x "SH-IDCA1404-10-140-54-[12,18,33,38,41,43,63,70-71,74,83,85]" \
python -m smoe.utils.tokenize \
-f jsonl \
-t $tokenizer_dir \
-i $data_dir/$data_type \
-o $out_dir/$data_type \
1>$logs_dir/tokenize_$data_type.log 2>&1 &
echo "$data_type > $logs_dir/tokenize_$data_type.log"
1>${log_path} 2>&1 &
echo "$data_type > $log_path"
done