Commit fe0b03b5 authored by silencealiang

Fix Llama2 bug and update file format

parent ee3ff5df
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
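The new topo-input.xml above is an RCCL/NCCL topology description of one training node: eight gfx936 GPUs fully meshed over xGMI, plus eight 200 Gb/s Mellanox ports (mlx5_2 through mlx5_9) on PCIe Gen5 x16 links. The scripts changed below consume it through the NCCL_TOPO_FILE environment variable; a minimal sketch, assuming MEGATRON_PATH is exported by the surrounding environment (the path shown is illustrative, not from the commit):

    # Point RCCL/NCCL at the committed topology file.
    export MEGATRON_PATH=/path/to/this/repo      # illustrative; set to the actual checkout
    export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
    export NCCL_DEBUG=INFO                       # the loaded topology file shows up in the debug log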
@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
@@ -148,10 +134,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x22B"}
)
fi
......
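The hunks above update the single-node Mixtral 8x22B script: data, tokenizer, and checkpoint locations now come from --data_path, --tokenizer_path, and --checkpoint_path options instead of hard-coded placeholder strings, the rendezvous port becomes the second positional argument (previously fixed at 25900), an optional --profiling option is still parsed, and NCCL_TOPO_FILE is resolved under MEGATRON_PATH. A minimal invocation sketch under the new interface; the script name, host address, and paths are placeholders, since the real file names are not shown on this page:

    # Hypothetical launch. Ranks are read from the OMPI_COMM_WORLD_* variables,
    # so the script is started through mpirun (8 processes for the 8-GPU node).
    mpirun -np 8 --bind-to none \
        bash pretrain_mixtral_8x22b.sh 10.10.0.1 25900 \
            --data_path=/data/my-mixtral_text_document \
            --tokenizer_path=/models/llama2/tokenizer.model \
            --checkpoint_path=/ckpt/mixtral_8x22b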
@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
@@ -148,10 +134,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x22B"}
)
fi
......
@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
@@ -148,6 +134,23 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x7B_1nodes_tp2-pp1-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
......
@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 8 9 10 11
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x7B_4nodes_tp2-pp4-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 4
@@ -148,6 +134,23 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 8 9 10 11
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x7B_4nodes_tp2-pp4-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
......
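Across all four scripts the TORCH_PROFIE_ARGS and HIP_PROFIE_ARGS blocks are also moved below LOGGING_ARGS and their profile output directories are renamed (ep_tp1 becomes etp1; the single-node 8x7B script additionally drops the wrong 8x22B prefix). How the parsed profiling value selects between the two argument sets is not part of the shown hunks; the following is only one plausible wiring, with the accepted values assumed rather than taken from the commit:

    # Assumption: this selection logic is not shown in the diff. The array names
    # and the profiling variable come from the hunks above; the values are illustrative.
    PROFILE_ARGS=()
    if [[ "${profiling}" == "torch" ]]; then
        PROFILE_ARGS=("${TORCH_PROFIE_ARGS[@]}")
    elif [[ "${profiling}" == "hip" ]]; then
        PROFILE_ARGS=("${HIP_PROFIE_ARGS[@]}")
    fi
    # ...later appended to the training command alongside the other *_ARGS arrays.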