Commit 1310cbf8 authored by silencealiang

add

parent 6509c277
Pipeline #2481 passed
10.16.6.2 slots=8
10.16.6.7 slots=8
node002 slots=8
node003 slots=8
node004 slots=8
node005 slots=8
node006 slots=8
node020 slots=8
node021 slots=8
node022 slots=8
node033 slots=8
node034 slots=8
node035 slots=8
node036 slots=8
node037 slots=8
node038 slots=8
node039 slots=8
node040 slots=8
node041 slots=8
node042 slots=8
node043 slots=8
node044 slots=8
node045 slots=8
node046 slots=8
node047 slots=8
node048 slots=8
node056 slots=8
node057 slots=8
node058 slots=8
node059 slots=8
node060 slots=8
node061 slots=8
node062 slots=8
node063 slots=8
@@ -765,7 +765,7 @@ class TEDotProductAttention(te.pytorch.DotProductAttention):
if is_te_min_version("1.9.0.dev0"):
class TEGroupedLinear(te.pytorch.GroupedLinear):
class TEGroupedLinear(te.pytorch.BatchLinear if int(os.getenv("GROUPED_GEMM_BatchLinear", '0')) else te.pytorch.GroupedLinear):
"""
Wrapper for the Transformer-Engine's `GroupedLinear` layer.
......
node021 slots=8
node022 slots=8
\ No newline at end of file
@@ -7,13 +7,11 @@ do
fi
done
mpirun -np 16 --hostfile 2nodes \
mpirun -np 256 --hostfile gptnodes \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
--mca plm_rsh_args "-p 12333" \
--mca btl_tcp_if_include ibs8 \
train_mixtral_8x7B_2nodes.sh 10.16.6.2 --profiling=$profiling > output.log 2>&1
train_GPT-MOE_567B.sh node002 --profiling=$profiling > output.log 2>&1
wait
......
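The gptnodes hostfile earlier in this commit lists the compute nodes at 8 slots each, and the launch above was scaled accordingly, from -np 16 on the old 2-node hostfile to -np 256 (32 nodes x 8 slots) on gptnodes. A quick pre-launch sanity check (a minimal sketch, assuming the hostfile is saved as gptnodes, the name used on the mpirun line):

# Sum the slots= values in the hostfile; the total should equal the -np passed to mpirun (256 here).
awk -F'slots=' '{ s += $2 } END { print s }' gptnodes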
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
mpirun -np 8 --allow-run-as-root \
train_GPT-MOE_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
for para in $*
do
if [[ $para == --profiling* ]];then
@@ -9,7 +7,8 @@ do
fi
done
mpirun -np 8 --allow-run-as-root train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
......
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
mpirun -np 16 --hostfile mixtralnodes \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x7B_2nodes.sh node021 --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
#rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
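Each launcher wrapper in this commit follows the same pattern: it scans its arguments for an optional --profiling flag, exports the profiling-related environment variables when the flag is present, and forwards the flag to the training script it launches via mpirun. A usage sketch (the wrapper filename is hypothetical; the diff shows only file contents):

bash run_mixtral_8x7B_2nodes.sh --profiling=torch   # selects the PyTorch profiler branch in the training script
bash run_mixtral_8x7B_2nodes.sh --profiling=hip     # selects the hipprof branch in the training script
bash run_mixtral_8x7B_2nodes.sh                     # no profiling; plain training run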
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=25900
CHECKPOINT_PATH=./CKPT
TOKENIZER_MODEL=./mixtral_dataset/tokenizer.model
DATA_PATH=./mixtral_dataset/my-mixtral_text_document
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
MODEL_ARGS=(
--use-mcore-models
--disable-bias-linear
--seq-length 8192
--max-position-embeddings 32768
--num-layers 64
--hidden-size 8192
--ffn-hidden-size 32768
--num-attention-heads 64
--init-method-std 0.01
--attention-dropout 0.0
--hidden-dropout 0.0
--normalization RMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights
--no-masked-softmax-fusion
--no-position-embedding
--rotary-base 1000000
)
MOE_ARGS=(
--num-experts 16
--moe-router-topk 2
--moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-2
--moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5
--moe-pad-expert-input-to-capacity
--moe-grouped-gemm
)
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--split 99990,8,2
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 4096
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-style cosine
--min-lr 1.0e-5
--weight-decay 0.1
--lr-warmup-iters 500
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 16
--expert-model-parallel-size 16
--expert-tensor-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
LOGGING_ARGS=(
--log-throughput \
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 5 \
#--save $CHECKPOINT_PATH \
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
)
fi
APP="python3 -u pretrain_gpt.py \
${DISTRIBUTED_ARGS[@]} \
${MODEL_ARGS[@]} \
${MOE_ARGS[@]} \
${DATA_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${LOGGING_ARGS[@]} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
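Every branch of the case statement above does the same thing: export the same CUDA_VISIBLE_DEVICES and run ${APP}; the branches differ only in the commented-out numactl bindings for the Hygon CPU. If the per-NUMA-node binding were re-enabled, a collapsed equivalent (a sketch, assuming NUMA node N maps to local rank N on the host) would be:

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Hypothetical collapsed form: bind each local rank's CPUs and memory to the matching NUMA node.
numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} ${APP}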
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=25900
CHECKPOINT_PATH=./CKPT
TOKENIZER_MODEL=./mixtral_dataset/tokenizer.model
DATA_PATH=./mixtral_dataset/my-mixtral_text_document
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
MODEL_ARGS=(
--use-mcore-models
--disable-bias-linear
--seq-length 8192
--max-position-embeddings 32768
--num-layers 2
--hidden-size 8192
--ffn-hidden-size 32768
--num-attention-heads 64
--init-method-std 0.01
--attention-dropout 0.0
--hidden-dropout 0.0
--normalization RMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights
--no-masked-softmax-fusion
--no-position-embedding
--rotary-base 1000000
)
MOE_ARGS=(
--num-experts 16
--moe-router-topk 2
--moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-2
--moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5
--moe-pad-expert-input-to-capacity
--moe-grouped-gemm
)
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--split 99990,8,2
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-style cosine
--min-lr 1.0e-5
--weight-decay 0.1
--lr-warmup-iters 500
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
--expert-model-parallel-size 8
--expert-tensor-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
LOGGING_ARGS=(
--log-throughput \
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 5 \
#--save $CHECKPOINT_PATH \
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
)
fi
APP="python3 -u pretrain_gpt.py \
${DISTRIBUTED_ARGS[@]} \
${MODEL_ARGS[@]} \
${MOE_ARGS[@]} \
${DATA_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${LOGGING_ARGS[@]} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
\ No newline at end of file
@@ -11,20 +11,20 @@ done
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
export HIP_DIRECT_DISPATCH=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
#export NVTE_FLASH_ATTN_TRITON=1
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=8
export NCCL_MIN_NCHANNELS=15
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
#export NCCL_IB_HCA=mlx5_0
#export NCCL_SOCKET_IFNAME=enp145s0f0
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
@@ -49,8 +49,8 @@ MODEL_ARGS=(
--disable-bias-linear
--seq-length 4096
--max-position-embeddings 32768
--num-layers 8
--hidden-size 1024
--num-layers 4
--hidden-size 4096
--ffn-hidden-size 14336
--num-attention-heads 32
--init-method-std 0.01
@@ -87,7 +87,7 @@ DATA_ARGS=(
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 128
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
@@ -99,21 +99,20 @@ TRAINING_ARGS=(
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-ranks 0 1 2 3 4 5 6 7 8
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_data_1nodes_dcu_batchgemm
--profile-dir torch_prof_mixtral_1nodes
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-ranks 0 1 2 3 4 5 6 7 8
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
......
@@ -11,20 +11,20 @@ done
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
export HIP_DIRECT_DISPATCH=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
#export NVTE_FLASH_ATTN_TRITON=1
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=8
export NCCL_MIN_NCHANNELS=15
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_0
export NCCL_SOCKET_IFNAME=enp33s0f3u1
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
@@ -99,30 +99,32 @@ TRAINING_ARGS=(
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
--recompute-granularity full
--recompute-method uniform
--recompute-num-layers 1
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
--profile-ranks 0 1 2 3 8 9 10 11
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_data_record_shapes
--profile-dir torch_prof_data_mixtral_2nodes
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
--profile-ranks 0 1 2 3 8 9 10 11
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 2
--expert-model-parallel-size 4
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 4
--expert-model-parallel-size 2
--expert-tensor-parallel-size 2
--use-distributed-optimizer
--sequence-parallel
......