增加2nodes脚本

b2f70020 · silencealiang · 35cb3cc1 · b2f70020 · b2f70020 · b2f70020
Commit b2f70020 authored Jan 16, 2025 by silencealiang
Showing with 261 additions and 18 deletions

2nodes 2nodes +2 -0

run_2nodes.sh run_2nodes.sh +21 -0

train_mixtral_8x7B_1nodes.sh train_mixtral_8x7B_1nodes.sh +26 -18

train_mixtral_8x7B_2nodes.sh train_mixtral_8x7B_2nodes.sh +212 -0

No files found.
--- a/2nodes
+++ b/2nodes
+10.16.6.2 slots=8
+10.16.6.7 slots=8
--- a/run_2nodes.sh
+++ b/run_2nodes.sh
+#!/bin/bash
+# Launches train_mixtral_8x7B_2nodes.sh on 16 ranks across the hosts listed in
+# the "2nodes" hostfile, using mpirun.
+#
+# Usage: run_2nodes.sh [--profiling=torch|hip]
+#   --profiling=<mode>  forwarded to the training script; also exports
+#                       GPU_FLUSH_ON_EXECUTION=1 and HIP_DIRECT_DISPATCH=0 here.
+# All mpirun output is written to output.log. The CKPT directory is removed
+# afterwards and the script exits with mpirun's status.
+
+for para in "$@"
+do
+    if [[ $para == --profiling* ]]; then
+        profiling=${para#*=}
+        export GPU_FLUSH_ON_EXECUTION=1
+        export HIP_DIRECT_DISPATCH=0
+    fi
+done
+
+# Forward --profiling only when a mode was actually requested. The training
+# script reacts to any argument that starts with --profiling, so an empty
+# "--profiling=" must not be passed on ordinary runs.
+train_args=(10.16.6.2)
+if [[ -n $profiling ]]; then
+    train_args+=("--profiling=$profiling")
+fi
+
+mpirun -np 16 --hostfile 2nodes \
+              --allow-run-as-root \
+              --bind-to none \
+              --mca plm_rsh_no_tree_spawn 1 \
+              --mca plm_rsh_args "-p 12333" \
+              --mca btl_tcp_if_include ibs8 \
+              train_mixtral_8x7B_2nodes.sh "${train_args[@]}" > output.log 2>&1
+status=$?
+
+# Clean up first, then report mpirun's result rather than rm's.
+rm -rf CKPT
+#rm -rf mixtral_dataset/my-mixtral_text_document
+exit "$status"
\ No newline at end of file
--- a/train_mixtral_8x7B_1nodes.sh
+++ b/train_mixtral_8x7B_1nodes.sh
@@ -15,10 +15,10 @@ export HIP_DIRECT_DISPATCH=0
 export HSA_FORCE_FINE_GRAIN_PCIE=1
 export OMP_NUM_THREADS=1
 export GPU_MAX_HW_QUEUES=10
-export NVTE_FLASH_ATTN_TRITON=1
+#export NVTE_FLASH_ATTN_TRITON=1
 export NCCL_ALGO=Ring
-export NCCL_NCHANNELS_PER_PEER=2
-export NCCL_MIN_NCHANNELS=16
+export NCCL_NCHANNELS_PER_PEER=8
+export NCCL_MIN_NCHANNELS=15
 export NCCL_IB_TIMEOUT=22
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 #export NCCL_IB_HCA=mlx5_0
@@ -27,7 +27,6 @@ export NCCL_NET_GDR_LEVEL=SYS
 export NCCL_NET_GDR_READ=0
 export GLOG_minloglevel=3

-
 RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -35,8 +34,8 @@ DIST_URL=${1}
 DIST_PORT=25900

 CHECKPOINT_PATH=./CKPT 
-TOKENIZER_MODEL=../../megatron-lm/mixtral_dataset/tokenizer.model
-DATA_PATH=../../megatron-lm/mixtral_dataset/my-mixtral_text_document
+TOKENIZER_MODEL=./mixtral_dataset/tokenizer.model
+DATA_PATH=./mixtral_dataset/my-mixtral_text_document

 DISTRIBUTED_ARGS=(
    --rank ${RANK}
@@ -50,7 +49,7 @@ MODEL_ARGS=(
    --disable-bias-linear
    --seq-length 4096
    --max-position-embeddings 32768
-    --num-layers 8 #16
+    --num-layers 8
    --hidden-size 1024
    --ffn-hidden-size 14336
    --num-attention-heads 32
@@ -88,9 +87,9 @@ DATA_ARGS=(

 TRAINING_ARGS=(
    --micro-batch-size 1
-    --global-batch-size 128 #256
+    --global-batch-size 128
    --lr 1e-4
-    --train-iters 20
+    --train-iters 10
    --lr-decay-iters 320000
    --lr-decay-style cosine
    --min-lr 1.0e-5
@@ -100,6 +99,7 @@ TRAINING_ARGS=(
    --bf16
    --overlap-param-gather
    --overlap-grad-reduce
+    #--tp-comm-overlap
 )

 TORCH_PROFIE_ARGS=(
@@ -107,7 +107,7 @@ TORCH_PROFIE_ARGS=(
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
-    --profile-dir torch_prof_data
+    --profile-dir torch_prof_data_1nodes_dcu_batchgemm
    --use-pytorch-profiler
 )

@@ -170,35 +170,43 @@ fi
 case ${LOCAL_RANK} in
 [0])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-  numactl --cpunodebind=0 --membind=0 ${APP}
+  ${APP}
+  #numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
 [1])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-  numactl --cpunodebind=1 --membind=1 ${APP}
+  ${APP}
+  #numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
 [2])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-  numactl --cpunodebind=2 --membind=2 ${APP}
+  ${APP}
+  #numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
 [3])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-  numactl --cpunodebind=3 --membind=3 ${APP}
+  ${APP}
+  #numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
 [4])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-  numactl --cpunodebind=4 --membind=4 ${APP}
+  ${APP}
+  #numactl --cpunodebind=4 --membind=4 ${APP}
  ;;
 [5])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-  numactl --cpunodebind=5 --membind=5 ${APP}
+  ${APP}
+  #numactl --cpunodebind=5 --membind=5 ${APP}
  ;;
 [6])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-  numactl --cpunodebind=6 --membind=6 ${APP}
+  ${APP}
+  #numactl --cpunodebind=6 --membind=6 ${APP}
  ;;
 [7])
  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-  numactl --cpunodebind=7 --membind=7 ${APP}
+  ${APP}
+  #numactl --cpunodebind=7 --membind=7 ${APP}
  ;;
 esac

--- a/train_mixtral_8x7B_2nodes.sh
+++ b/train_mixtral_8x7B_2nodes.sh
+#!/bin/bash
+# Per-rank worker for 2-node Mixtral 8x7B training; started by mpirun.
+#
+# Usage: train_mixtral_8x7B_2nodes.sh <master-host> [--profiling=torch|hip]
+
+# Scan all arguments for --profiling=<mode>. The profiling environment is
+# exported only when the mode is non-empty, so an empty "--profiling=" has
+# no effect on a normal training run.
+for para in "$@"
+do
+    if [[ $para == --profiling* ]]; then
+        profiling=${para#*=}
+        if [[ -n $profiling ]]; then
+            export GPU_FLUSH_ON_EXECUTION=1
+            export HIP_DIRECT_DISPATCH=0
+        fi
+    fi
+done
+
+# Load the DTK environment script.
+source /opt/dtk/env.sh
+# Runs Mixtral 8x7B model
+# Runtime and communication-library settings exported to the training process.
+export HIP_DIRECT_DISPATCH=0
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export OMP_NUM_THREADS=1
+export GPU_MAX_HW_QUEUES=10
+#export NVTE_FLASH_ATTN_TRITON=1
+export NCCL_ALGO=Ring
+export NCCL_NCHANNELS_PER_PEER=8
+export NCCL_MIN_NCHANNELS=15
+export NCCL_IB_TIMEOUT=22
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+# NOTE(review): the next two values name specific devices (mlx5_0, enp33s0f3u1);
+# presumably they must match the hardware on every node - confirm before reuse.
+export NCCL_IB_HCA=mlx5_0
+export NCCL_SOCKET_IFNAME=enp33s0f3u1
+export NCCL_NET_GDR_LEVEL=SYS
+export NCCL_NET_GDR_READ=0
+export GLOG_minloglevel=3
+
+# Rank information is read from the OMPI_COMM_WORLD_* variables of the MPI launcher.
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+# First positional argument: host part of the tcp:// URL passed as --dist-url.
+DIST_URL=${1}
+DIST_PORT=25900
+
+# Paths are relative to the directory the script is started from.
+CHECKPOINT_PATH=./CKPT 
+TOKENIZER_MODEL=./mixtral_dataset/tokenizer.model
+DATA_PATH=./mixtral_dataset/my-mixtral_text_document
+
+# Process identity and rendezvous address for this rank.
+DISTRIBUTED_ARGS=(
+    --rank ${RANK}
+    --world-size ${WORLD_SIZE}
+    --local-rank ${LOCAL_RANK}
+    --dist-url tcp://${DIST_URL}:${DIST_PORT}
+)
+
+# Model architecture flags.
+MODEL_ARGS=(
+    --use-mcore-models
+    --disable-bias-linear
+    --seq-length 4096
+    --max-position-embeddings 32768
+    --num-layers 32
+    --hidden-size 4096
+    --ffn-hidden-size 14336
+    --num-attention-heads 32
+    --init-method-std 0.01
+    --attention-dropout 0.0
+    --hidden-dropout 0.0
+    --normalization RMSNorm
+    --position-embedding-type rope
+    --swiglu
+    --untie-embeddings-and-output-weights
+    --group-query-attention
+    --num-query-groups 8
+    --no-masked-softmax-fusion
+    --no-position-embedding
+    --rotary-base 1000000
+)
+
+# Mixture-of-experts flags: 8 experts with top-2 routing.
+MOE_ARGS=(
+    --num-experts 8
+    --moe-router-topk 2
+    --moe-router-load-balancing-type aux_loss
+    --moe-aux-loss-coeff 1e-2
+    --moe-token-dispatcher-type alltoall
+    --moe-expert-capacity-factor 0.5
+    --moe-pad-expert-input-to-capacity
+    --moe-grouped-gemm
+)
+
+# Tokenizer and dataset flags.
+DATA_ARGS=(
+    --tokenizer-type Llama2Tokenizer
+    --tokenizer-model ${TOKENIZER_MODEL}
+    --data-path $DATA_PATH
+    --split 99990,8,2
+)
+
+# Batch size, learning-rate schedule and precision flags.
+# NOTE(review): --lr-warmup-iters (500) is larger than --train-iters (10);
+# presumably intentional for a short benchmark run - confirm.
+TRAINING_ARGS=(
+    --micro-batch-size 1
+    --global-batch-size 256
+    --lr 1e-4
+    --train-iters 10
+    --lr-decay-iters 320000
+    --lr-decay-style cosine
+    --min-lr 1.0e-5
+    --weight-decay 0.1
+    --lr-warmup-iters 500
+    --clip-grad 1.0
+    --bf16
+    --overlap-param-gather
+    --overlap-grad-reduce
+    #--tp-comm-overlap
+)
+
+# Extra flags appended to the command when --profiling=torch is given.
+# The array name is spelled "PROFIE" where it is used further down; keep both in sync.
+TORCH_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_data_record_shapes
+    --use-pytorch-profiler
+)
+
+# Extra flags appended to the command when --profiling=hip is given.
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
+
+# Parallelism layout: tensor 2, pipeline 2, expert 4, expert-tensor 2.
+MODEL_PARALLEL_ARGS=(
+    --tensor-model-parallel-size 2
+    --pipeline-model-parallel-size 2
+    --expert-model-parallel-size 4
+    --expert-tensor-parallel-size 2
+    --use-distributed-optimizer
+    --sequence-parallel
+)
+
+# Logging and interval flags. --save and --load are commented out, so no
+# checkpoint path is passed; only the tensorboard directory lives under CKPT.
+LOGGING_ARGS=(
+    --log-throughput \
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters -1 \
+    #--save $CHECKPOINT_PATH \
+    #--load $CHECKPOINT_PATH \
+    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
+    --no-load-optim \
+    --no-load-rng
+)
+
+# Add Weights & Biases flags only when an API key is present in the environment.
+# The names are quoted so a project or run name containing spaces stays one word.
+if [[ -n "${WANDB_API_KEY}" ]]; then
+    LOGGING_ARGS+=(
+        --wandb-project "${WANDB_PROJECT:-Mixtral}"
+        --wandb-exp-name "${WANDB_NAME:-Mixtral_8x7B}"
+    )
+fi
+
+# Build the training command as an array: every argument stays exactly one
+# word and nothing is re-split or glob-expanded when the command is run.
+APP=(
+    python3 -u pretrain_gpt.py
+    "${DISTRIBUTED_ARGS[@]}"
+    "${MODEL_ARGS[@]}"
+    "${MOE_ARGS[@]}"
+    "${DATA_ARGS[@]}"
+    "${TRAINING_ARGS[@]}"
+    "${MODEL_PARALLEL_ARGS[@]}"
+    "${LOGGING_ARGS[@]}"
+)
+
+# Optional profiling: "torch" appends the PyTorch profiler flags; "hip"
+# appends the HIP profiler flags and wraps the whole command in hipprof.
+if [[ $profiling == "torch" ]]; then
+    APP+=("${TORCH_PROFIE_ARGS[@]}")
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data
+    APP+=("${HIP_PROFIE_ARGS[@]}")
+    APP=(hipprof -d hip_prof_data --hip-trace --trace-off "${APP[@]}")
+fi
+
+#for hygon cpu
+# Local ranks 0-7 all see the same eight devices and run the same command,
+# so a single arm covers them. Any other value (including an empty
+# LOCAL_RANK when started outside mpirun) is reported instead of silently
+# launching nothing.
+case ${LOCAL_RANK} in
+[0-7])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  "${APP[@]}"
+  #numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} "${APP[@]}"
+  ;;
+*)
+  printf 'train_mixtral_8x7B_2nodes.sh: unexpected LOCAL_RANK "%s" (expected 0-7); training not started\n' "${LOCAL_RANK}" >&2
+  exit 1
+  ;;
+esac
+