Merge pull request #31 from pjlab-sys4nlp/scaling_13b
CPT: fix tb logging, fix grad ckpting, faster data loading
DaizeDong authored Oct 7, 2023
2 parents f8a56b4 + f0e5ae3 commit 4bff10e
Showing 34 changed files with 916 additions and 201 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -167,7 +167,7 @@ outputs/
/visualization/
results/analysis/cluster_*.png
results/expert_load_vis
results/analysis_clustering7
results/analysis_clustering*
results/gate_loss_100b
results/RandomSplit-l2_norm-llama_7B-16Select4-up_proj
results/gate_loss_original_clustering_model
2 changes: 1 addition & 1 deletion .vscode/launch.json
@@ -9,7 +9,7 @@
"type": "python",
"request": "attach",
"connect": {
"host": "SH-IDCA1404-10-140-54-115",
"host": "SH-IDCA1404-10-140-54-122",
"port": 5678
},
"pathMappings": [
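The updated launch.json entry points the Python "attach" debug configuration at the new node SH-IDCA1404-10-140-54-122 on port 5678. Attaching only works if a debugpy server is already listening at that address; a minimal sketch of starting one on that node is shown below (the srun options and target script are illustrative assumptions, not part of this commit).

# Hypothetical sketch: start a debugpy server on the debug node so the
# "attach" configuration above can connect on port 5678.
srun -p MoE -w SH-IDCA1404-10-140-54-122 \
    python -m debugpy --listen 0.0.0.0:5678 --wait-for-client your_script.py
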
60 changes: 35 additions & 25 deletions scripts/cpt/fpt.sh
@@ -1,33 +1,37 @@
#!/usr/bin/bash

#SBATCH --job-name=cpt-16select4-64gpus
#SBATCH --job-name=cpt-7b-test
#SBATCH --output=logs/%x-%j.log
#SBATCH --error=logs/%x-%j.log

#SBATCH --partition=MoE
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=0
#SBATCH -x SH-IDCA1404-10-140-54-116
#SBATCH -x SH-IDCA1404-10-140-54-116,SH-IDCA1404-10-140-54-70

#SBATCH --nodes=7
#SBATCH --nodes=1
#SBATCH --gres=gpu:8

source ~/anaconda3/bin/activate smoe

num_nodes=7 # should match with --nodes
num_gpu_per_node=8 # should match with --gres

# #cpu/#num_gpu_per_node
export OMP_NUM_THREADS=4
export LOGLEVEL=INFO
# export NCCL_DEBUG=INFO
# export TORCH_DISTRIBUTED_DEBUG=DETAIL
# export TORCH_SHOW_CPP_STACKTRACES=1
# export CUDA_LAUNCH_BLOCKING=1

{
num_nodes=1 # should match with --nodes
num_gpu_per_node=8 # should match with --gres

# #cpu/#num_gpu_per_node
export OMP_NUM_THREADS=16
export LOGLEVEL=INFO
# export NCCL_DEBUG=INFO
# export TORCH_DISTRIBUTED_DEBUG=DETAIL
# export TORCH_SHOW_CPP_STACKTRACES=1
# export CUDA_LAUNCH_BLOCKING=1

comment="exp purpose"

# model_type="llama"
# pretrained_model="outputs/llama1_7B_random"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
model_type="llama_moe"
pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B_MoE_16Select4-l2_norm_bak
@@ -40,13 +44,14 @@ export LOGLEVEL=INFO
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed

lr=3e-4
lr=1e-4
final_lr_portion=0.1
per_device_train_batch_size=8
per_device_train_batch_size=16
per_device_eval_batch_size=1
gradient_accumulation_steps=4
gradient_accumulation_steps=2
block_size=2048
num_tokens="1*10^11"
seed=1227
deepspeed_config_file=conf/deepspeed/bf16_zero1_default.json

max_steps=$(echo "${num_tokens} / ($block_size * $per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node)" | bc)
@@ -61,8 +66,11 @@ export LOGLEVEL=INFO
data_cache=resources/cache
output_dir=outputs/$SLURM_JOB_NAME-$SLURM_JOB_ID
mkdir -p $output_dir
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
echo "output_dir: $output_dir"
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
git diff > $output_dir/diff.patch
env > $output_dir/.env
echo $comment > $output_dir/comment.txt

nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIS ) )
nodes_array=($nodes)
@@ -78,7 +86,7 @@ export LOGLEVEL=INFO
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
@@ -89,7 +97,7 @@ export LOGLEVEL=INFO
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--do_train \
--seed $RANDOM \
--seed ${seed} \
--bf16 \
--num_train_epochs 1 \
--final_lr_portion ${final_lr_portion} \
@@ -102,8 +110,6 @@ export LOGLEVEL=INFO
--warmup_steps 2000 \
--max_steps ${max_steps} \
--max_train_samples ${max_train_samples} \
--logging_strategy steps \
--logging_steps 10 \
--save_strategy steps \
--save_total_limit 2 \
--save_steps 1000 \
@@ -113,12 +119,16 @@ export LOGLEVEL=INFO
--output_dir ${output_dir} \
--overwrite_output_dir \
--ddp_timeout 30000 \
--logging_first_step True \
--torch_dtype bfloat16 \
--ddp_find_unused_parameters False \
--torch_dtype bfloat16 \
--gradient_checkpointing \
--report_to none \
--log_level info
--logging_first_step True \
--logging_strategy steps \
--logging_steps 10 \
--log_level info \
--log_level_replica warning \
--log_on_each_node False \
--report_to none
}
#SBATCH --job-name=cpt-moe-fpt-test_lr_change
# Before the change: --logging_steps 10 \
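The max_steps and max_train_samples lines above derive the training schedule from the token budget rather than hard-coding it. With the values now set in fpt.sh (num_tokens="1*10^11", block_size=2048, per_device_train_batch_size=16, gradient_accumulation_steps=2, 1 node with 8 GPUs), the bc expressions evaluate roughly as in the sketch below, assuming these variables are not overridden at submission time.

# Rough check of the schedule arithmetic in fpt.sh (values copied from the script above).
num_tokens="1*10^11"; block_size=2048
per_device_train_batch_size=16; gradient_accumulation_steps=2
num_nodes=1; num_gpu_per_node=8

tokens_per_step=$(echo "$block_size * $per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node" | bc)
echo "tokens per optimizer step: $tokens_per_step"                     # 524288
echo "max_steps: $(echo "${num_tokens} / $tokens_per_step" | bc)"      # 190734
echo "max_train_samples: $(echo "${num_tokens} / $block_size" | bc)"   # 48828125
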
87 changes: 60 additions & 27 deletions scripts/cpt/fpt_13b.sh
@@ -1,59 +1,80 @@
#!/usr/bin/bash

#SBATCH --job-name=cpt-moe-fpt-13b-64gpus-bs8_4-task_test
#SBATCH --job-name=cpt-13b-test
#SBATCH --output=logs/%x-%j.log
#SBATCH --error=logs/%x-%j.log
##SBATCH --output=logs/%x.log
##SBATCH --error=logs/%x.log

#SBATCH --partition=MoE
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=0
#SBATCH -x SH-IDCA1404-10-140-54-116
#SBATCH --time=8:00:00

#SBATCH --nodes=8
#SBATCH --nodes=2
#SBATCH --gres=gpu:8
#SBATCH --quotatype=auto
##SBATCH --time=5:00:00

source ~/anaconda3/bin/activate smoe

{
num_nodes=8 # should match with --nodes
num_nodes=2 # should match with --nodes
num_gpu_per_node=8 # should match with --gres

# #cpu/#num_gpu_per_node
export OMP_NUM_THREADS=4
export OMP_NUM_THREADS=16
export LOGLEVEL=INFO
# export NCCL_DEBUG=INFO
# export TORCH_DISTRIBUTED_DEBUG=DETAIL
# export TORCH_SHOW_CPP_STACKTRACES=1
# export CUDA_LAUNCH_BLOCKING=1

lr=3e-4
# comment="13B, expert 4/16, noisy gate, seq len 2048, lr=4e-4, expert weight re-scale"
comment="13B, expert 4/16, noisy gate, seq len 2048, lr=4e-4"
# comment="random initialized llama1-7B"
# comment="random initialized llama1-13B"
# comment="7B, expert 4/16, noisy gate, gradient shared neurons, w/o residual, w/o weight re-scale, lr2e-4"
# comment="3B MoE, debug"

# model_type="llama"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
# model_type="llama_moe"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B_MoE_16Select4-l2_norm
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/llama_13B"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
model_type="llama_moe"
pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Clustering-l2/llama_13B-16Select4-up_proj"
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama_3B-8Select2-4320Neurons-Share"
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama_7B-16Select4-688Neurons-Share"
pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons-Share"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B_MoE_16Select4-l2_norm
# pretrained_model="/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Clustering-l2/llama_13B-16Select4-up_proj"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
# pretrained_model=$1
echo "==================> $pretrained_model <=================="
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Clustering-l2/llama_13B-16Select4-up_proj
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Graph-l2_norm/llama_13B-16Select4-up_proj
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Random/llama_13B-16Select4-up_proj
# pretrained_model=$1
echo "==================> $pretrained_model <=================="

# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-no-softmax-copy/Clustering-l2-l2_norm/llama_13B-16Select4-gate_proj
# tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama1_7B_random
tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_13B
# tokenizer_path="/mnt/petrelfs/share_data/quxiaoye/models/llama_3B"

dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed
# dataset_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples_openllama3B_tokenized

lr=2e-4
final_lr_portion=0.1
per_device_train_batch_size=8
per_device_eval_batch_size=1
gradient_accumulation_steps=4
num_tokens="3*10^11"
seed=1227
block_size=2048
max_steps=$(echo "10^11 / ($block_size * $per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node)" | bc)
max_train_samples=$(echo "10^11 / $block_size" | bc)
deepspeed_config_file=conf/deepspeed/bf16_zero1_default.json

max_steps=$(echo "${num_tokens} / ($block_size * $per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node)" | bc)
max_train_samples=$(echo "${num_tokens} / $block_size" | bc)
echo "max_steps: $max_steps"
echo "max_train_samples: $max_train_samples"
global_bs=$(echo "$per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node" | bc)
@@ -63,8 +84,13 @@ source ~/anaconda3/bin/activate smoe

data_cache=resources/cache
output_dir=outputs/$SLURM_JOB_NAME-$SLURM_JOB_ID
# output_dir=/mnt/petrelfs/share_data/quxiaoye/models/tzhu_model_bak/cpt-13b-16gpus-lr2e-4
mkdir -p $output_dir
echo "output_dir: $output_dir"
deepspeed_config_file=conf/deepspeed/bf16_zero2_default.json
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
git diff > $output_dir/diff.patch
env > $output_dir/.env
echo $comment > $output_dir/comment.txt

nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIS ) )
nodes_array=($nodes)
@@ -73,14 +99,15 @@ source ~/anaconda3/bin/activate smoe
echo "Node: $head_node"
echo "Node IP: $head_node_ip"

# --resume_from_checkpoint /mnt/petrelfs/share_data/quxiaoye/models/tzhu_model_bak/cpt-13b-16gpus-lr2e-4/checkpoint-2000 \
srun torchrun \
--nnodes ${num_nodes} \
--nproc_per_node ${num_gpu_per_node} \
--node_rank $SLURM_NODEID \
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
@@ -91,10 +118,10 @@ source ~/anaconda3/bin/activate smoe
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--do_train \
--seed $RANDOM \
--seed ${seed} \
--bf16 \
--num_train_epochs 1 \
--final_lr_portion 0.1 \
--final_lr_portion ${final_lr_portion} \
--optim adamw_torch \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
@@ -103,9 +130,7 @@ source ~/anaconda3/bin/activate smoe
--max_grad_norm 1.0 \
--warmup_steps 2000 \
--max_steps ${max_steps} \
--max_train_samples 48828125 \
--logging_strategy steps \
--logging_steps 10 \
--max_train_samples ${max_train_samples} \
--save_strategy steps \
--save_total_limit 1 \
--save_steps 1000 \
@@ -115,10 +140,18 @@ source ~/anaconda3/bin/activate smoe
--output_dir ${output_dir} \
--overwrite_output_dir \
--ddp_timeout 30000 \
--logging_first_step True \
--torch_dtype bfloat16 \
--ddp_find_unused_parameters False \
--torch_dtype bfloat16 \
--gradient_checkpointing \
--report_to none \
--log_level info
--logging_first_step True \
--logging_strategy steps \
--logging_steps 10 \
--log_level info \
--log_level_replica warning \
--log_on_each_node False \
--gate_type "TopKBalancedNoisyGate" \
--calculator_type "UniversalCalculator" \
--num_selects 4 \
--report_to none

}
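fpt_13b.sh now uses the same token-budget arithmetic with a 3*10^11-token budget, a fixed seed (1227) instead of $RANDOM, and explicit MoE flags (gate_type, calculator_type, num_selects). With the values set in this script (2 nodes of 8 GPUs, per-device batch size 8, gradient accumulation 4, block size 2048), the derived quantities work out roughly as sketched below, again assuming nothing overrides them when the job is submitted.

# Rough check of the fpt_13b.sh schedule (values copied from the script above).
num_tokens="3*10^11"; block_size=2048
per_device_train_batch_size=8; gradient_accumulation_steps=4
num_nodes=2; num_gpu_per_node=8

global_bs=$(echo "$per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node" | bc)
echo "global batch size (sequences): $global_bs"                               # 512
echo "tokens per optimizer step: $(echo "$global_bs * $block_size" | bc)"      # 1048576
echo "max_steps: $(echo "${num_tokens} / ($global_bs * $block_size)" | bc)"    # 286102
echo "max_train_samples: $(echo "${num_tokens} / $block_size" | bc)"           # 146484375
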
2 changes: 1 addition & 1 deletion scripts/cpt/fpt_resume.sh
@@ -76,7 +76,7 @@ export LOGLEVEL=INFO
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--ignore_data_skip \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
2 changes: 1 addition & 1 deletion scripts/cpt/fpt_switch.sh
@@ -82,7 +82,7 @@ export LOGLEVEL=INFO
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
2 changes: 1 addition & 1 deletion scripts/cpt/fpt_test_lr.sh
@@ -69,7 +69,7 @@ export LOGLEVEL=INFO
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
2 changes: 1 addition & 1 deletion scripts/cpt/gate_loss.sh
@@ -75,7 +75,7 @@ export GATE_LOSS_RESULTS_DIR="results/RandomSplit-l2_norm-llama_7B-16Select4-up_
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_fpt.py \
smoe/entrypoint/cpt/cpt_fpt.py \
--ignore_data_skip \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
2 changes: 1 addition & 1 deletion scripts/cpt/lora.sh
@@ -67,7 +67,7 @@ srun torchrun \
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt_lora.py \
smoe/entrypoint/cpt/cpt_lora.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
12 changes: 6 additions & 6 deletions scripts/tokenize/clustering.sh
@@ -3,8 +3,8 @@
set -vx

tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
data_dir=/mnt/petrelfs/zhutong/smoe/resources/clustering_samples
out_dir=/mnt/petrelfs/share_data/quxiaoye/data/16clusters
data_dir=/mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32
out_dir=/mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32_tokenized
logs_dir=logs

mkdir -p $out_dir
@@ -13,13 +13,13 @@ mkdir -p $logs_dir
# for loop in: en_arxiv, en_book, en_c4, en_cc, en_stack, en_wikipedia, github
for data_type in $(ls $data_dir)
do
log_path=logs/tokenize_$data_type.log
nohup srun -p MoE -N1 -n1 --cpus-per-task=32 \
log_path=logs/tokenize_${data_type}_32clusters.log
nohup srun -p MoE -N1 -n1 --cpus-per-task=32 -x "SH-IDCA1404-10-140-54-[12,18,33,38,41,43,63,70-71,74,83,85]" \
python -m smoe.utils.tokenize \
-f jsonl \
-t $tokenizer_dir \
-i $data_dir/$data_type \
-o $out_dir/$data_type \
1>$logs_dir/tokenize_$data_type.log 2>&1 &
echo "$data_type > $logs_dir/tokenize_$data_type.log"
1>${log_path} 2>&1 &
echo "$data_type > $log_path"
done