Commit 64266070 authored by wxj

Update Llama_pretraining.sh

parent 40ea1bd3
Pipeline #2206 passed
@@ -48,8 +48,11 @@ GPT_MODEL_ARGS=(
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--seq-length 4096
--max-position-embeddings 4096
--normalization RMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights # keep the input embedding and output weights separate, for more flexibility
)
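
These dimensions match Llama-2-7B. A quick sanity check of the implied parameter count (a bash sketch; the 32-layer depth and 32000-token vocabulary are assumed Llama-2 defaults that do not appear in this hunk):

H=4096; FFN=11008; NL=32; V=32000     # NL and V are assumed, not set in this hunk
ATTN=$(( 4 * H * H ))                 # Q, K, V and output projections per layer
MLP=$(( 3 * H * FFN ))                # gate, up and down projections (SwiGLU)
EMBED=$(( 2 * V * H ))                # untied input embedding plus output head
echo $(( NL * (ATTN + MLP) + EMBED )) # ~6.7e9 parameters, i.e. the "7B" model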
# export NVTE_FLASH_ATTN=1 # use the cutlass path
@@ -83,11 +86,13 @@ TRAINING_ARGS=(
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective
--ddp-average-in-collective # average gradients/params directly inside the DP collective instead of summing them (onto one device) first and dividing afterwards
# --recompute-granularity full # enable activation recomputation: lower memory use, extra compute time
# --recompute-num-layers 5
# --recompute-method block
--overlap-grad-reduce
--overlap-grad-reduce # overlap the DDP gradient reduce with the backward pass
# --tp-comm-overlap # overlap tensor-parallel communication with GEMM; this optimization is not yet adapted here
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM; not yet adapted
--use-flash-attn-triton
)
# --use-flash-attn-cutlass # cutlass flash attention
@@ -96,16 +101,13 @@ TRAINING_ARGS=(
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 4
--pipeline-model-parallel-size 2
)
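
Halving --pipeline-model-parallel-size from 4 to 2 doubles the data-parallel degree and gives each pipeline stage 16 layers instead of 8 (assuming the 32-layer model above). A quick check, assuming the single node with 8 GPUs implied by HIP_VISIBLE_DEVICES below:

WORLD_SIZE=8                         # assumed: one node, 8 GPUs
TP=2; PP=2
echo $(( WORLD_SIZE / (TP * PP) ))   # data-parallel size: 2 (it was 1 with PP=4)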
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)
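
For reference, $DATA_PATH must point at the prefix of an indexed dataset (.bin/.idx pair), and --split 949,50,1 divides it 94.9/5.0/0.1% into train/validation/test. A sketch of how such a dataset is typically built with Megatron's preprocessing tool (corpus.jsonl and the output prefix are placeholders; flag names can differ between Megatron versions):

# python tools/preprocess_data.py \
#     --input corpus.jsonl \
#     --output-prefix llama2_pretrain \
#     --tokenizer-type Llama2Tokenizer \
#     --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model \
#     --workers 8 \
#     --append-eod
# DATA_PATH would then be set to the generated llama2_pretrain_text_document prefix.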
@@ -157,46 +159,46 @@ APP="python -u pretrain_gpt.py \
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # or a subset such as 4,5,6,7
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # or 0,1,2,3
# ${APP}
# run one process per local rank, binding CPU and memory to NUMA node 0
case ${LOCAL_RANK} in
[0])
# ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[2])
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[3])
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[5])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[6])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[7])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
esac
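
All eight branches above bind both CPU and memory to NUMA node 0. On a machine with more than one NUMA node, binding each rank to the node nearest its GPU is usually faster; a sketch assuming two NUMA nodes with four GPUs each (verify the real topology with numactl --hardware):

# NODE=$(( LOCAL_RANK / 4 ))   # assumed mapping: GPUs 0-3 on node 0, GPUs 4-7 on node 1
# numactl --cpunodebind=${NODE} --membind=${NODE} ${APP}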