Commit 70368616 authored by silencealiang

update model parameters

parent 8551c38e
@@ -2,14 +2,13 @@ for para in $*
 do
     if [[ $para == --profiling* ]];then
         profiling=${para#*=}
-        export GPU_FLUSH_ON_EXECUTION=1
-        export HIP_DIRECT_DISPATCH=0
     fi
 done
 mpirun -np 8 --allow-run-as-root \
-    train_deepseek_v3_1node.sh localhost --profiling=$profiling > output.log 2>&1
+    train_deepseekv3_671B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
 wait
-rm -rf CKPT
+rm -rf output
+rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
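Every launch wrapper in this commit accepts the same optional `--profiling=` flag, which is forwarded to the per-node training script and matched there against `torch` or `hip`. A minimal invocation sketch (the wrapper filename below is illustrative; it is not shown in this excerpt):

```
# Hypothetical wrapper name; substitute the actual run script from the repo.
bash run_deepseekv3_671B_1nodes.sh --profiling=torch   # PyTorch profiler on local ranks 0-7
bash run_deepseekv3_671B_1nodes.sh --profiling=hip     # hipprof trace
bash run_deepseekv3_671B_1nodes.sh                     # no profiling
```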
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
mpirun -np 32 --hostfile hostfile_deepseekv3_671B_4nodes \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_4nodes.sh node002 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
wait
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
mpirun -np 1024 --hostfile hostfile_deepseekv3_671B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_multinodes.sh node001 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
wait
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
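The `--hostfile` arguments above (`hostfile_deepseekv3_671B_4nodes`, `hostfile_deepseekv3_671B`) are not part of this commit. A minimal sketch of the expected Open MPI hostfile format, assuming one rank per GPU (8 slots per node); the node names are placeholders:

```
# hostfile_deepseekv3_671B_4nodes (illustrative): 4 nodes x 8 slots = 32 ranks
node001 slots=8
node002 slots=8
node003 slots=8
node004 slots=8
```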
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
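The topology file above is what the training scripts point `NCCL_TOPO_FILE` at. A quick, rough way to cross-check that the HCAs listed in `NCCL_IB_HCA` actually appear in the file, and to see how the eight GPUs are enumerated, is plain text matching (a sketch; adjust the path as needed):

```
# RDMA NICs declared in the topology file (NCCL_IB_HCA uses mlx5_2 ... mlx5_9)
grep -o 'name="mlx5_[0-9]*"' topo-input.xml | sort -u

# GPU entries with their device index and rank (expect 8 on this node)
grep -o 'gpu dev="[0-9]*"[^>]*rank="[0-9]*"' topo-input.xml
```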
 #!/bin/bash
 for para in $*
 do
     if [[ $para == --profiling* ]];then
         profiling=${para#*=}
-        # export GPU_FLUSH_ON_EXECUTION=1
-        # export HIP_DIRECT_DISPATCH=0
     fi
 done
+# Runs DeepseekV3 671B model
+source /opt/dtk/env.sh
+# default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export GLOG_minloglevel=3
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export HSA_FORCE_FINE_GRAIN_PCIE=1
 export OMP_NUM_THREADS=1
 export GPU_MAX_HW_QUEUES=10
+# nccl env
 export NCCL_ALGO=Ring
 export NCCL_MIN_NCHANNELS=32
 export NCCL_MAX_NCHANNELS=32
@@ -22,22 +32,20 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-export NCCL_TOPO_FILE="/public/home/yuguo/check/rccl-tests-0204/topo-input.xml" #"your topo file"
-export GLOG_minloglevel=3
-export GROUPED_GEMM_BatchLinear=1
-export LD_LIBRARY_PATH=/public/home/yuguo/data/rocblas-install-0224/lib:$LD_LIBRARY_PATH
-LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-RANK=$OMPI_COMM_WORLD_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+export NCCL_TOPO_FILE="./topo-input.xml"
+# enable BatchLinear
+export GROUPED_GEMM_BatchLinear=1
+#export MP_PP0_LAYERS=2 # enable depending on the actual setup
 ### BASE CONFIG ###
 MODEL_SIZE=A37B
 BATCH_SIZE=1
 GLOBAL_BATCH_SIZE=256
-LR=1e-5
+LR=1e-4
 MIN_LR=1e-6
 SEQ_LEN=4096
+PAD_LEN=4096
 PR=bf16
 ### BASE CONFIG ###
@@ -45,6 +53,7 @@ PR=bf16
 TP=1
 PP=2
 CP=1
+ETP=1
 EP=4
 SP=true
 DO=true
@@ -56,13 +65,14 @@ SFT=false
 AC=none
 OPTIMIZER_OFFLOAD=false
 SAVE_INTERVAL=500
-DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
-VALID_DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
-PRETRAIN_CHECKPOINT_PATH=${MEGATRON_PATH}/deepseekv3_dataset #"your model path"
+DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
+VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
+PRETRAIN_CHECKPOINT_PATH="./output"
+TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
 # the following two values will not be used when SFT is true
-TRAIN_TOKENS=100000000
-WARMUP_TOKENS=10000
+TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
+WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
 ###############################
 OUTPUT_BASEPATH=./output
@@ -72,20 +82,19 @@ if [ $FL = true ]; then
     :
     #exit -1
 elif [ $FL = false ]; then
-    export NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1
     attn_backend_option=" \
-        --attention-backend fused
+        --attention-backend auto
         "
 fi
 if [ $MODEL_SIZE = A37B ]; then
-    TRAIN_ITERS=2
+    TRAIN_ITERS=10
     HIDDEN_SIZE=7168
     NUM_ATTENTION_HEADS=128
     NUM_LAYERS=2
     INTERMEDIATE_SIZE=18432
     MOE_INTERMEDIATE_SIZE=2048
-    MAX_POSITION_EMBEDDINGS=${SEQ_LEN}
+    MAX_POSITION_EMBEDDINGS=163840
     EXTRA_VOCAB_SIZE=467
     Q_LORA_RANK=1536
     KV_LORA_RANK=512
@@ -94,32 +103,43 @@ if [ $MODEL_SIZE = A37B ]; then
     V_HEAD_DIM=128
     ROPE_THETA=10000
     SCALE_FACTOR=40
-    NUM_EXPERTS=8 #256
+    NUM_EXPERTS=8
     ROUTER_TOPK=8
     NUM_SHARED_EXPERTS=1
     RMS_NORM_EPS=1e-6
     moe_options=" \
         --moe-grouped-gemm \
-        --moe-expert-capacity-factor 1 \
+        --moe-expert-capacity-factor 0.5 \
         --moe-pad-expert-input-to-capacity \
         --moe-token-dispatcher-type alltoall \
         --moe-router-topk ${ROUTER_TOPK} \
-        --num-experts ${NUM_EXPERTS} \
-        --expert-model-parallel-size ${EP} \
-        --expert-tensor-parallel-size 1 \
-        --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
-        --moe-router-load-balancing-type aux_loss \
-        --moe-aux-loss-coeff 0.001 \
-        --moe-layer-freq ([0]*0+[1]*2) \
-        --q-lora-rank ${Q_LORA_RANK} \
-        --kv-lora-rank ${KV_LORA_RANK} \
-        --qk-head-dim ${QK_NOPE_HEAD_DIM} \
-        --qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
-        --v-head-dim ${V_HEAD_DIM} \
-        --moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
-        "
+        --moe-router-group-topk 2 \
+        --moe-router-num-groups 4 \
+        --num-experts ${NUM_EXPERTS} \
+        --expert-model-parallel-size ${EP} \
+        --expert-tensor-parallel-size ${ETP} \
+        --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
+        --moe-router-load-balancing-type seq_aux_loss \
+        --moe-router-topk-scaling-factor 2.5 \
+        --moe-shared-expert-overlap \
+        --moe-router-enable-expert-bias \
+        --mscale 1.0 \
+        --mscale-all-dim 1.0 \
+        --moe-router-score-function sigmoid \
+        --moe-router-bias-update-rate 0.001 \
+        --moe-aux-loss-coeff 0.001 \
+        --moe-layer-freq ([0]*1+[1]*1) \
+        --moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
+        --q-lora-rank ${Q_LORA_RANK} \
+        --kv-lora-rank ${KV_LORA_RANK} \
+        --qk-head-dim ${QK_NOPE_HEAD_DIM} \
+        --qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
+        --v-head-dim ${V_HEAD_DIM} \
+        --mtp-num-layers 1 \
+        "
+    mtp_options=""
 fi
 # Here are some configs controlled by env
@@ -147,6 +167,14 @@ comm_overlap_option="\
     --overlap-grad-reduce \
     --overlap-param-gather"
+# if [ $TP_COMM_OVERLAP -eq 1 ]; then
+# comm_overlap_option="\
+#     --tp-comm-overlap \
+#     --overlap-grad-reduce \
+#     --overlap-param-gather"
+# fi
 if [ $AC = full ]; then
     _check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
     if [ $_check != 0 ]; then
@@ -154,9 +182,9 @@ if [ $AC = full ]; then
         exit -1
     fi
     activation_checkpoint_options=" \
         --recompute-method uniform \
         --recompute-num-layers ${MP_AC_LAYERS} \
         --recompute-granularity full"
 elif [ $AC = sel ]; then
     activation_checkpoint_options=" \
         --recompute-activations"
@@ -165,8 +193,8 @@ elif [ $AC = none ]; then
     "
 elif [ $AC = offload ]; then
     activation_checkpoint_options=" \
         --cpu-offloading \
         --cpu-offloading-num-layers ${MP_AC_LAYERS}"
     if [ $TP_COMM_OVERLAP -eq 1 ]; then
         echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
         comm_overlap_option="\
@@ -179,8 +207,8 @@ fi
 if [ $PR = fp16 ]; then
     pr_options=" \
         --fp16 \
         --apply-query-key-layer-scaling"
     export NVTE_APPLY_QK_LAYER_SCALING=1
 elif [ $PR = bf16 ]; then
     pr_options=" \
@@ -200,7 +228,7 @@ fi
 if [ $DO = true ]; then
     do_option=" \
         --use-distributed-optimizer"
 elif [ $DO = false ]; then
     do_option=" \
@@ -210,7 +238,7 @@ fi
 if [ $SP = true ] && [ $TP -gt 1 ]; then
     sp_option=" \
         --sequence-parallel"
 elif [ $SP = false ]; then
     sp_option=" \
@@ -236,7 +264,7 @@ fi
 if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
     load_option=" \
-        --tokenizer-model $PRETRAIN_CHECKPOINT_PATH"
+        --load $PRETRAIN_CHECKPOINT_PATH"
 fi
 if [ $OPTIMIZER_OFFLOAD != false ]; then
@@ -247,15 +275,21 @@ if [ $OPTIMIZER_OFFLOAD != false ]; then
 fi
 if [ $SFT = true ]; then
-    TRAIN_ITERS=${24}
-    LR_WARMUP_ITERS=${25}
+    TRAIN_ITERS=${25}
+    LR_WARMUP_ITERS=${26}
     LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
-    PREFIX="finetune-mcore-deepseek-v3"
+    PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
+    sft_options=" \
+        --eod-mask-loss \
+        --calculate-per-token-loss \
+        --train-mode finetune"
 else
     # TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
     LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
     LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
-    PREFIX="pretrain-mcore-deepseek-v3"
+    PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
+    sft_options=" \
+        --train-mode pretrain"
 fi
 if [ ${MP_DATASET_TYPE} = "raw" ]; then
@@ -278,16 +312,18 @@ else
 fi
 ##### Prepare logdirs #######
-NAME="${PREFIX}"
+NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
 mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
 mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
 mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}"
+current_time=$(date "+%Y.%m.%d-%H.%M.%S")
+TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
 mkdir -p ${TENSORBOARD_DIR}
 SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
 mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
-find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
+#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
+#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
 megatron_options=" \
     --lr ${LR} \
@@ -314,7 +350,7 @@ megatron_options=" \
     --log-interval 1 \
     --log-throughput \
     --eval-interval 10000 \
-    --eval-iters 5 \
+    --eval-iters 3 \
     --save-interval ${SAVE_INTERVAL} \
     --tensorboard-queue-size 1 \
     --tensorboard-dir ${TENSORBOARD_DIR} \
@@ -328,13 +364,12 @@ megatron_options=" \
     --num-workers 8 \
     --extra-vocab-size ${EXTRA_VOCAB_SIZE} \
     --tokenizer-type DeepSeekV2Tokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL_PATH} \
     --swiglu \
     --normalization RMSNorm \
     --norm-epsilon ${RMS_NORM_EPS} \
     --use-rotary-position-embeddings \
-    --no-bias-swiglu-fusion \
     --no-rope-fusion \
-    --position-embedding-type rope \
     --untie-embeddings-and-output-weights \
     --disable-bias-linear \
     --rotary-base ${ROPE_THETA} \
@@ -342,12 +377,11 @@ megatron_options=" \
     --no-save-optim \
     --kv-channels ${V_HEAD_DIM} \
     --qk-layernorm \
+    --multi-latent-attention \
     --ckpt-format torch \
     --transformer-impl transformer_engine \
+    --no-masked-softmax-fusion \
     --use-rope-scaling \
-    --multi-latent-attention \
-    --mtp-num-layers 1 \
-    --use-mcore-models \
     "
 TORCH_PROFIE_ARGS=" \
@@ -355,7 +389,7 @@ TORCH_PROFIE_ARGS=" \
     --profile-ranks 0 1 2 3 4 5 6 7 \
     --profile-step-start 3 \
     --profile-step-end 4 \
-    --profile-dir torch_prof_data_16nodes_dcu \
+    --profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
     --use-pytorch-profiler \
     "
@@ -367,26 +401,30 @@ HIP_PROFIE_ARGS=" \
     --use-hip-profiler \
     "
-APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
-    ${megatron_options} \
-    ${dataset_options} \
-    ${pr_options} \
-    ${load_option} \
-    ${activation_checkpoint_options} \
-    ${do_option} \
-    ${sp_option} \
-    ${moe_options} \
-    ${offload_option} \
-    ${sft_options} \
-    ${vp_option} \
-    ${packing_options} \
-    ${uneven_split_option} \
-    ${attn_backend_option} \
-    ${comm_overlap_option} \
+DISTRIBUTED_ARGS=" \
     --rank ${RANK} \
     --world-size ${WORLD_SIZE} \
     --local-rank ${LOCAL_RANK} \
-    --dist-url tcp://${1}:25900 \
+    --dist-url tcp://${DIST_URL}:${DIST_PORT} \
+    "
+APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
+    ${megatron_options} \
+    ${dataset_options} \
+    ${pr_options} \
+    ${load_option} \
+    ${activation_checkpoint_options} \
+    ${do_option} \
+    ${sp_option} \
+    ${moe_options} \
+    ${offload_option} \
+    ${vp_option} \
+    ${packing_options} \
+    ${uneven_split_option} \
+    ${attn_backend_option} \
+    ${mtp_options} \
+    ${comm_overlap_option} \
+    ${DISTRIBUTED_ARGS} \
     "
 if [[ $profiling == "torch" ]]; then
@@ -397,37 +435,38 @@ elif [[ $profiling == "hip" ]]; then
     APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
 fi
+#for hygon cpu
 case ${LOCAL_RANK} in
     [0])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=0 --membind=0 ${APP}
         ;;
     [1])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=1 --membind=1 ${APP}
        ;;
    [2])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=2 --membind=2 ${APP}
        ;;
    [3])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=3 --membind=3 ${APP}
        ;;
    [4])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=4 --membind=4 ${APP}
        ;;
    [5])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=5 --membind=5 ${APP}
        ;;
    [6])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=6 --membind=6 ${APP}
       ;;
    [7])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=7 --membind=7 ${APP}
       ;;
 esac
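One effect of switching `TRAIN_TOKENS` and `WARMUP_TOKENS` to multiples of `GLOBAL_BATCH_SIZE * SEQ_LEN` is that the derived schedule lengths become fixed iteration counts, independent of the batch and sequence settings. A small arithmetic check for the 1-node config above (GLOBAL_BATCH_SIZE=256, SEQ_LEN=4096):

```
GLOBAL_BATCH_SIZE=256; SEQ_LEN=4096
TRAIN_TOKENS=$((10000 * GLOBAL_BATCH_SIZE * SEQ_LEN))    # 10,485,760,000 tokens
WARMUP_TOKENS=$((2000 * GLOBAL_BATCH_SIZE * SEQ_LEN))    #  2,097,152,000 tokens
echo "lr-decay-iters:  $((TRAIN_TOKENS / GLOBAL_BATCH_SIZE / SEQ_LEN))"    # 10000
echo "lr-warmup-iters: $((WARMUP_TOKENS / GLOBAL_BATCH_SIZE / SEQ_LEN))"   # 2000
```

Note that `TRAIN_ITERS` itself stays hard-coded (10 in these scripts); only the decay and warmup schedule is derived from the token counts.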
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=2 # enable depending on the actual setup
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=512
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
TP=2
PP=2
CP=1
ETP=1
EP=16
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###
### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
### OTHERS ###
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
attn_backend_option=" \
--attention-backend auto
"
fi
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=3
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
QK_NOPE_HEAD_DIM=128
QK_ROPE_HEAD_DIM=64
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=256
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 4 \
--moe-router-num-groups 8 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*1+[1]*2) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controlled by env
if [ -z ${MP_DATASET_TYPE} ];then
MP_DATASET_TYPE="idxmap"
fi
if [ -z ${MP_AC_LAYERS} ];then
MP_AC_LAYERS=1
fi
if [ -z ${MP_VP} ]; then
vp_option=""
else
vp_option=" \
--num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z ${MP_SFT_PACKING} ]; then
MP_SFT_PACKING=false
fi
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
echo "the num layers per pp rank must be a multiple of the recompute layers."
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
elif [ $AC = none ]; then
activation_checkpoint_options=" \
"
elif [ $AC = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
--tp-comm-overlap"
else
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option=""
fi
fi
if [ $PR = fp16 ]; then
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options=" \
--bf16"
elif [ $PR = fp8 ]; then
pr_options=" \
--bf16 \
--fp8-format hybrid \
--fp8-amax-compute-algo max \
--fp8-amax-history-len 1024"
fi
if [ $OPTIMIZER_OFFLOAD != false ] && [ $DO = false ]; then
echo "Offload optimizer is valid only if \$DO=true"
DO=true
fi
if [ $DO = true ]; then
do_option=" \
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_option=" \
"
fi
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_option=" \
--sequence-parallel"
elif [ $SP = false ]; then
sp_option=" \
"
fi
if [ -z ${MP_PP0_LAYERS} ];then
uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
_check=$(( ( $NUM_LAYERS - ${MP_PP0_LAYERS} ) % ( ${PP} - 1 ) ))
if [ $_check != 0 ]; then
echo "With uneven pipelineing the left over layers must be divisible by left over stages."
exit -1
fi
uneven_split_option=" \
--decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}
"
else
echo "uneven pipeline split must be used when PP > 1"
exit -1
fi
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--load $PRETRAIN_CHECKPOINT_PATH"
fi
if [ $OPTIMIZER_OFFLOAD != false ]; then
offload_option=" \
--optimizer-cpu-offload \
--use-precision-aware-optimizer \
--optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi
if [ $SFT = true ]; then
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
if [ ${MP_DATASET_TYPE} = "raw" ]; then
dataset_options=" \
--train-data-path ${DATASET_PATH} \
--valid-data-path ${VALID_DATASET_PATH} \
--dataloader-type cyclic \
--dataset JSON-SFT"
else
dataset_options=" \
--data-path ${DATASET_PATH} \
--split 99,1,0"
fi
if [ ${MP_SFT_PACKING} = true ]; then
echo "Currently MLA do not support THD format attention, thus sequence packing can not be used..."
packing_options=""
else
packing_options=""
fi
##### Prepare logdirs #######
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options=" \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--clip-grad 1.0 \
--init-method-std 0.008 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--lr-decay-iters ${LR_DECAY_ITERS} \
--lr-warmup-iters ${LR_WARMUP_ITERS} \
--train-iters ${TRAIN_ITERS} \
--micro-batch-size ${BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--log-timers-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--no-load-optim \
--no-load-rng \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
--rotary-scaling-factor ${SCALE_FACTOR} \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
"
TORCH_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--use-pytorch-profiler \
"
HIP_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 4 \
--profile-step-end 5 \
--use-hip-profiler \
"
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
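For the 4-node configuration above (32 ranks, TP=2, PP=2, CP=1, EP=16, ETP=1), a quick divisibility check, assuming the usual Megatron-LM relation DP = WORLD_SIZE / (TP * PP * CP):

```
WORLD_SIZE=32; TP=2; PP=2; CP=1
if (( WORLD_SIZE % (TP * PP * CP) != 0 )); then
    echo "world size is not divisible by TP*PP*CP" >&2
    exit 1
fi
echo "data-parallel size: $((WORLD_SIZE / (TP * PP * CP)))"   # 8 for this config
```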
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=2 # enable depending on the actual setup
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=4096
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
TP=4
PP=8
CP=1
ETP=2
EP=64
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###
### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
### OTHERS ###
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
attn_backend_option=" \
--attention-backend auto
"
fi
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=61
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
QK_NOPE_HEAD_DIM=128
QK_ROPE_HEAD_DIM=64
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=256
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 4 \
--moe-router-num-groups 8 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*3+[1]*58) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controlled by env
if [ -z ${MP_DATASET_TYPE} ];then
MP_DATASET_TYPE="idxmap"
fi
if [ -z ${MP_AC_LAYERS} ];then
MP_AC_LAYERS=1
fi
if [ -z ${MP_VP} ]; then
vp_option=""
else
vp_option=" \
--num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z ${MP_SFT_PACKING} ]; then
MP_SFT_PACKING=false
fi
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
echo "the num layers per pp rank must be a multiple of the recompute layers."
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
elif [ $AC = none ]; then
activation_checkpoint_options=" \
"
elif [ $AC = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
--tp-comm-overlap"
else
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option=""
fi
fi
if [ $PR = fp16 ]; then
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options=" \
--bf16"
elif [ $PR = fp8 ]; then
pr_options=" \
--bf16 \
--fp8-format hybrid \
--fp8-amax-compute-algo max \
--fp8-amax-history-len 1024"
fi
if [ $OPTIMIZER_OFFLOAD != false ] && [ $DO = false ]; then
echo "Offload optimizer is valid only if \$DO=true"
DO=true
fi
if [ $DO = true ]; then
do_option=" \
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_option=" \
"
fi
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_option=" \
--sequence-parallel"
elif [ $SP = false ]; then
sp_option=" \
"
fi
if [ -z ${MP_PP0_LAYERS} ];then
uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
_check=$(( ( $NUM_LAYERS - ${MP_PP0_LAYERS} ) % ( ${PP} - 1 ) ))
if [ $_check != 0 ]; then
echo "With uneven pipelineing the left over layers must be divisible by left over stages."
exit -1
fi
uneven_split_option=" \
--decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}
"
else
echo "uneven pipeline split must be used when PP > 1"
exit -1
fi
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--load $PRETRAIN_CHECKPOINT_PATH"
fi
if [ $OPTIMIZER_OFFLOAD != false ]; then
offload_option=" \
--optimizer-cpu-offload \
--use-precision-aware-optimizer \
--optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi
if [ $SFT = true ]; then
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
if [ ${MP_DATASET_TYPE} = "raw" ]; then
dataset_options=" \
--train-data-path ${DATASET_PATH} \
--valid-data-path ${VALID_DATASET_PATH} \
--dataloader-type cyclic \
--dataset JSON-SFT"
else
dataset_options=" \
--data-path ${DATASET_PATH} \
--split 99,1,0"
fi
if [ ${MP_SFT_PACKING} = true ]; then
echo "Currently MLA do not support THD format attention, thus sequence packing can not be used..."
packing_options=""
else
packing_options=""
fi
##### Prepare logdirs #######
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options=" \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--clip-grad 1.0 \
--init-method-std 0.008 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--lr-decay-iters ${LR_DECAY_ITERS} \
--lr-warmup-iters ${LR_WARMUP_ITERS} \
--train-iters ${TRAIN_ITERS} \
--micro-batch-size ${BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--log-timers-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--no-load-optim \
--no-load-rng \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
--rotary-scaling-factor ${SCALE_FACTOR} \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
"
TORCH_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--use-pytorch-profiler \
"
HIP_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 4 \
--profile-step-end 5 \
--use-hip-profiler \
"
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
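Across the three DeepSeek-V3 configurations, the per-step token budget scales with the global batch size while the sequence length stays at 4096. A quick comparison using only the values set in the scripts:

```
SEQ_LEN=4096
for cfg in "1node:256" "4nodes:512" "multinode:4096"; do
    name=${cfg%%:*}; gbs=${cfg##*:}
    echo "${name}: GBS=${gbs} -> $((gbs * SEQ_LEN)) tokens per step"
done
# 1node:     256  * 4096 =  1,048,576 tokens/step
# 4nodes:    512  * 4096 =  2,097,152 tokens/step
# multinode: 4096 * 4096 = 16,777,216 tokens/step
```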
# GPT3 MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH="" #<Specify path>
VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
```
NOTE: Depending on the environment you are running in, the above command might look slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run the 175B model. There are other configs you could run as well:
### 345M
```
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--seq-length 1024 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
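To try one of these smaller configurations, one option is to substitute the flags above for the 175B settings inside `train_gpt3_175b_distributed.sh`. A rough sketch, assuming the script groups its flags in arrays (the `GPT_MODEL_ARGS` and `MODEL_PARALLEL_ARGS` names are illustrative, not taken from this excerpt):

```
# Illustrative only: the 345M settings from the table above.
GPT_MODEL_ARGS=(
    --num-layers 12
    --hidden-size 512
    --num-attention-heads 8
    --seq-length 1024
)
MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 1
)
```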
@@ -6,7 +6,7 @@ do
 done
 mpirun -np 8 --allow-run-as-root \
-    train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
+    train_gpt_567B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
 wait
@@ -5,11 +5,11 @@ do
     fi
 done
-mpirun -np 512 --hostfile hostfile_gpt_567B \
+mpirun -np 1024 --hostfile hostfile_gpt_567B \
     --allow-run-as-root \
     --bind-to none \
     --mca plm_rsh_no_tree_spawn 1 \
-    train_gpt_567B_multinodes.sh node059 --profiling=$profiling > output.log 2>&1
+    train_gpt_567B_multinodes.sh node059 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
 wait
...@@ -93,11 +93,11 @@ TRAINING_ARGS=( ...@@ -93,11 +93,11 @@ TRAINING_ARGS=(
--global-batch-size 256 --global-batch-size 256
--lr 1e-4 --lr 1e-4
--train-iters 10 --train-iters 10
--lr-decay-iters 320000 --lr-decay-iters 10000
--lr-decay-style cosine --lr-decay-style cosine
--min-lr 1.0e-5 --min-lr 1.0e-6
--weight-decay 0.1 --weight-decay 0.1
--lr-warmup-iters 500 --lr-warmup-iters 2000
--clip-grad 1.0 --clip-grad 1.0
--bf16 --bf16
--overlap-param-gather --overlap-param-gather
...@@ -126,6 +126,7 @@ MODEL_PARALLEL_ARGS=( ...@@ -126,6 +126,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 1 --pipeline-model-parallel-size 1
--expert-model-parallel-size 4 --expert-model-parallel-size 4
--expert-tensor-parallel-size 2 --expert-tensor-parallel-size 2
--context-parallel-size 1
--use-distributed-optimizer --use-distributed-optimizer
--sequence-parallel --sequence-parallel
) )
...@@ -173,42 +174,34 @@ fi ...@@ -173,42 +174,34 @@ fi
case ${LOCAL_RANK} in case ${LOCAL_RANK} in
[0]) [0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=0 --membind=0 ${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=1 --membind=1 ${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;; ;;
[2]) [2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=2 --membind=2 ${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;; ;;
[3]) [3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=3 --membind=3 ${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[4]) [4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=4 --membind=4 ${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;; ;;
[5]) [5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=5 --membind=5 ${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;; ;;
[6]) [6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=6 --membind=6 ${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;; ;;
[7]) [7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=7 --membind=7 ${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;; ;;
esac esac
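Every branch of the case statement above applies the same pattern with a different NUMA node index, so the change amounts to binding each local rank's CPU threads and memory allocations to one node. A compact equivalent, assuming `LOCAL_RANK` runs from 0 to 7 and maps one-to-one onto NUMA nodes, would be:
```
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Pin this rank's host-side work (data loading, pinned-buffer staging) to its local NUMA node.
numactl --cpunodebind="${LOCAL_RANK}" --membind="${LOCAL_RANK}" ${APP}
```
Keeping the explicit eight-way case statement, as the commit does, makes it easy to adjust the binding for an individual rank if the node numbering ever differs.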
...@@ -90,14 +90,14 @@ DATA_ARGS=( ...@@ -90,14 +90,14 @@ DATA_ARGS=(
TRAINING_ARGS=( TRAINING_ARGS=(
--micro-batch-size 1 --micro-batch-size 1
--global-batch-size 1024 --global-batch-size 2048
--lr 1e-4 --lr 1e-4
--train-iters 10 --train-iters 10
--lr-decay-iters 320000 --lr-decay-iters 10000
--lr-decay-style cosine --lr-decay-style cosine
--min-lr 1.0e-5 --min-lr 1.0e-6
--weight-decay 0.1 --weight-decay 0.1
--lr-warmup-iters 500 --lr-warmup-iters 2000
--clip-grad 1.0 --clip-grad 1.0
--bf16 --bf16
--overlap-param-gather --overlap-param-gather
...@@ -109,7 +109,7 @@ TORCH_PROFIE_ARGS=( ...@@ -109,7 +109,7 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7 --profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3 --profile-step-start 3
--profile-step-end 4 --profile-step-end 4
--profile-dir torch_prof_gpt_64nodes_tp4-pp8-ep16-ep_tp4-cp2 --profile-dir torch_prof_gpt_64nodes_tp4-pp16-ep16-ep_tp4-cp2
--use-pytorch-profiler --use-pytorch-profiler
) )
...@@ -123,11 +123,10 @@ HIP_PROFIE_ARGS=( ...@@ -123,11 +123,10 @@ HIP_PROFIE_ARGS=(
MODEL_PARALLEL_ARGS=( MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4 --tensor-model-parallel-size 4
--pipeline-model-parallel-size 8 --pipeline-model-parallel-size 16
--expert-model-parallel-size 16 --expert-model-parallel-size 16
--expert-tensor-parallel-size 4 --expert-tensor-parallel-size 4
--context-parallel-size 2 --context-parallel-size 2
#--num-layers-per-virtual-pipeline-stage 2
--use-distributed-optimizer --use-distributed-optimizer
--sequence-parallel --sequence-parallel
) )
...@@ -175,42 +174,34 @@ fi ...@@ -175,42 +174,34 @@ fi
case ${LOCAL_RANK} in case ${LOCAL_RANK} in
[0]) [0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=0 --membind=0 ${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=1 --membind=1 ${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;; ;;
[2]) [2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=2 --membind=2 ${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;; ;;
[3]) [3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=3 --membind=3 ${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[4]) [4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=4 --membind=4 ${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;; ;;
[5]) [5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=5 --membind=5 ${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;; ;;
[6]) [6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=6 --membind=6 ${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;; ;;
[7]) [7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=7 --membind=7 ${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;; ;;
esac esac
...@@ -6,7 +6,7 @@ do ...@@ -6,7 +6,7 @@ do
done done
mpirun -np 8 --allow-run-as-root \ mpirun -np 8 --allow-run-as-root \
train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1 train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
wait wait
......
...@@ -9,7 +9,7 @@ mpirun -np 64 --hostfile hostfile_mixtral_8x22B \ ...@@ -9,7 +9,7 @@ mpirun -np 64 --hostfile hostfile_mixtral_8x22B \
--allow-run-as-root \ --allow-run-as-root \
--bind-to none \ --bind-to none \
--mca plm_rsh_no_tree_spawn 1 \ --mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > output.log 2>&1 train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > log-8nodes-`date +%F-%H%M`.log 2>&1
wait wait
......
...@@ -6,7 +6,7 @@ do ...@@ -6,7 +6,7 @@ do
done done
mpirun -np 8 --allow-run-as-root \ mpirun -np 8 --allow-run-as-root \
train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1 train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
wait wait
......
...@@ -9,7 +9,7 @@ mpirun -np 32 --hostfile hostfile_mixtral_8x7B \ ...@@ -9,7 +9,7 @@ mpirun -np 32 --hostfile hostfile_mixtral_8x7B \
--allow-run-as-root \ --allow-run-as-root \
--bind-to none \ --bind-to none \
--mca plm_rsh_no_tree_spawn 1 \ --mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > output.log 2>&1 train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
wait wait
......
...@@ -96,11 +96,11 @@ TRAINING_ARGS=( ...@@ -96,11 +96,11 @@ TRAINING_ARGS=(
--global-batch-size 256 --global-batch-size 256
--lr 1e-4 --lr 1e-4
--train-iters 10 --train-iters 10
--lr-decay-iters 320000 --lr-decay-iters 10000
--lr-decay-style cosine --lr-decay-style cosine
--min-lr 1.0e-5 --min-lr 1.0e-6
--weight-decay 0.1 --weight-decay 0.1
--lr-warmup-iters 500 --lr-warmup-iters 2000
--clip-grad 1.0 --clip-grad 1.0
--bf16 --bf16
--overlap-param-gather --overlap-param-gather
...@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=( ...@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 1 --pipeline-model-parallel-size 1
--expert-model-parallel-size 8 --expert-model-parallel-size 8
--expert-tensor-parallel-size 1 --expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer --use-distributed-optimizer
--sequence-parallel --sequence-parallel
) )
...@@ -143,7 +144,8 @@ LOGGING_ARGS=( ...@@ -143,7 +144,8 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \ #--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \ --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \ --no-load-optim \
--no-load-rng --no-load-rng \
--no-save-optim
) )
if [ -n "${WANDB_API_KEY}" ]; then if [ -n "${WANDB_API_KEY}" ]; then
...@@ -175,43 +177,34 @@ fi ...@@ -175,43 +177,34 @@ fi
case ${LOCAL_RANK} in case ${LOCAL_RANK} in
[0]) [0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=0 --membind=0 ${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=1 --membind=1 ${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;; ;;
[2]) [2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=2 --membind=2 ${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;; ;;
[3]) [3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=3 --membind=3 ${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[4]) [4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=4 --membind=4 ${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;; ;;
[5]) [5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=5 --membind=5 ${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;; ;;
[6]) [6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=6 --membind=6 ${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;; ;;
[7]) [7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=7 --membind=7 ${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;; ;;
esac esac
...@@ -96,11 +96,11 @@ TRAINING_ARGS=( ...@@ -96,11 +96,11 @@ TRAINING_ARGS=(
--global-batch-size 256 --global-batch-size 256
--lr 1e-4 --lr 1e-4
--train-iters 10 --train-iters 10
--lr-decay-iters 320000 --lr-decay-iters 10000
--lr-decay-style cosine --lr-decay-style cosine
--min-lr 1.0e-5 --min-lr 1.0e-6
--weight-decay 0.1 --weight-decay 0.1
--lr-warmup-iters 500 --lr-warmup-iters 2000
--clip-grad 1.0 --clip-grad 1.0
--bf16 --bf16
--overlap-param-gather --overlap-param-gather
...@@ -112,7 +112,7 @@ TORCH_PROFIE_ARGS=( ...@@ -112,7 +112,7 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7 --profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3 --profile-step-start 3
--profile-step-end 4 --profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-ep_tp1-cp1 --profile-dir torch_prof_mixtral8x22B_1nodes_tp4-pp8-ep8-ep_tp1-cp1
--use-pytorch-profiler --use-pytorch-profiler
) )
...@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=( ...@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 8 --pipeline-model-parallel-size 8
--expert-model-parallel-size 8 --expert-model-parallel-size 8
--expert-tensor-parallel-size 1 --expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer --use-distributed-optimizer
--sequence-parallel --sequence-parallel
) )
...@@ -143,7 +144,8 @@ LOGGING_ARGS=( ...@@ -143,7 +144,8 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \ #--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \ --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \ --no-load-optim \
--no-load-rng --no-load-rng \
--no-save-optim
) )
if [ -n "${WANDB_API_KEY}" ]; then if [ -n "${WANDB_API_KEY}" ]; then
...@@ -175,43 +177,34 @@ fi ...@@ -175,43 +177,34 @@ fi
case ${LOCAL_RANK} in case ${LOCAL_RANK} in
[0]) [0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=0 --membind=0 ${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=1 --membind=1 ${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;; ;;
[2]) [2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=2 --membind=2 ${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;; ;;
[3]) [3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=3 --membind=3 ${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[4]) [4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=4 --membind=4 ${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;; ;;
[5]) [5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=5 --membind=5 ${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;; ;;
[6]) [6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=6 --membind=6 ${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;; ;;
[7]) [7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=7 --membind=7 ${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;; ;;
esac esac