Merge branch 'megatron_v0.11.0' into 'main'

Megatron v0.11.0. See merge request OpenDAS/dcu_megatron!2

Merge branch 'megatron_v0.11.0' into 'main'
Megatron v0.11.0. See merge request OpenDAS/dcu_megatron!2
61968c79 · dongcl · be9a69d7 · e45e6793 · 61968c79 · 61968c79
Commit 61968c79 authored Apr 11, 2025 by dongcl
9 changed files
--- a/examples/mixtral/run_mixtral_8x22B_multinodes.sh
+++ b/examples/mixtral/run_mixtral_8x22B_multinodes.sh
+for para in $*
+do
+    if [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+    fi
+done
+
+mpirun -np 64 --hostfile hostfile_mixtral_8x22B \
+              --allow-run-as-root \
+              --bind-to none \
+              --mca plm_rsh_no_tree_spawn 1 \
+              train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > output.log 2>&1
+
+wait
+
+rm -rf CKPT
+rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
--- a/examples/mixtral/run_mixtral_8x7B_1nodes.sh
+++ b/examples/mixtral/run_mixtral_8x7B_1nodes.sh
+for para in $*
+do
+    if [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+    fi
+done
+
+mpirun -np 8  --allow-run-as-root \
+              train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
+
+wait
+
+rm -rf CKPT
+rm -rf mixtral_dataset/my-mixtral_text_document
--- a/examples/mixtral/run_mixtral_8x7B_multinodes.sh
+++ b/examples/mixtral/run_mixtral_8x7B_multinodes.sh
+for para in $*
+do
+    if [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+    fi
+done
+
+mpirun -np 32 --hostfile hostfile_mixtral_8x7B \
+              --allow-run-as-root \
+              --bind-to none \
+              --mca plm_rsh_no_tree_spawn 1 \
+              train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > output.log 2>&1
+
+wait
+
+rm -rf CKPT
+rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
--- a/examples/mixtral/topo-input.xml
+++ b/examples/mixtral/topo-input.xml
+<system version="2">
+  <cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
+    <pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+        <pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
+          <gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
+            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+        <pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+          <pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
+            <gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
+              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
+        <nic>
+          <net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
+          <net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
+    <pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+        <pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
+          <gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
+            <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+        <pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+          <pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
+            <gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
+              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
+        <nic>
+          <net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
+          <net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
+    <pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+        <pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
+          <gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
+            <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+        <pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+          <pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
+            <gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
+              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
+        <nic>
+          <net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
+          <net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
+    <pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+        <pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
+          <gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
+            <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
+            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+        <pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
+          <pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
+            <gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
+              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
+              <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
+            </gpu>
+          </pci>
+        </pci>
+      </pci>
+      <pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
+        <nic>
+          <net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
+          <net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
+        </nic>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
+    <pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
+      <nic>
+        <net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
+        <net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
+      </nic>
+    </pci>
+  </cpu>
+</system>
--- a/examples/mixtral/train_mixtral_8x22B_1nodes.sh
+++ b/examples/mixtral/train_mixtral_8x22B_1nodes.sh
+#!/bin/bash
+
+for para in $*
+do
+    if [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+    fi
+done
+
+# Runs Mixtral 8x22B model
+source /opt/dtk/env.sh
+
+# default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export GLOG_minloglevel=3
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export OMP_NUM_THREADS=1
+export GPU_MAX_HW_QUEUES=10
+
+# nccl env
+export NCCL_ALGO=Ring
+export NCCL_MIN_NCHANNELS=32
+export NCCL_MAX_NCHANNELS=32
+export NCCL_NET_GDR_LEVEL=7
+export NCCL_NET_GDR_READ=1
+export RCCL_SDMA_COPY_ENABLE=0
+export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+export NCCL_TOPO_FILE="./topo-input.xml"
+
+# enable BatchLinear
+export GROUPED_GEMM_BatchLinear=1
+
+# data path
+CHECKPOINT_PATH="path to CKPT" 
+TOKENIZER_MODEL="path to tokenizer.model"
+DATA_PATH="path to my-mixtral_text_document"
+
+DISTRIBUTED_ARGS=(
+    --rank ${RANK}
+    --world-size ${WORLD_SIZE}
+    --local-rank ${LOCAL_RANK}
+    --dist-url tcp://${DIST_URL}:${DIST_PORT}
+)
+
+MODEL_ARGS=(
+    --use-mcore-models
+    --disable-bias-linear
+    --seq-length 4096
+    --max-position-embeddings 65536
+    --num-layers 4
+    --hidden-size 6144
+    --ffn-hidden-size 16384
+    --num-attention-heads 48
+    --init-method-std 0.01
+    --attention-dropout 0.0
+    --hidden-dropout 0.0
+    --normalization RMSNorm
+    --position-embedding-type rope
+    --swiglu
+    --untie-embeddings-and-output-weights
+    --group-query-attention
+    --num-query-groups 8
+    --no-masked-softmax-fusion
+    --no-position-embedding
+    --rotary-base 1000000
+    --ckpt-format torch
+)
+
+MOE_ARGS=(
+    --num-experts 8
+    --moe-router-topk 2
+    --moe-router-load-balancing-type aux_loss
+    --moe-aux-loss-coeff 1e-3
+    --moe-token-dispatcher-type alltoall
+    --moe-expert-capacity-factor 0.5
+    --moe-pad-expert-input-to-capacity
+    #--moe-grouped-gemm
+)
+
+DATA_ARGS=(
+    --tokenizer-type Llama2Tokenizer
+    --tokenizer-model ${TOKENIZER_MODEL}
+    --data-path $DATA_PATH
+    --split 99990,8,2
+)
+
+TRAINING_ARGS=(
+    --micro-batch-size 1
+    --global-batch-size 256
+    --lr 1e-4
+    --train-iters 10
+    --lr-decay-iters 320000
+    --lr-decay-style cosine
+    --min-lr 1.0e-5
+    --weight-decay 0.1
+    --lr-warmup-iters 500
+    --clip-grad 1.0
+    --bf16
+    --overlap-param-gather
+    --overlap-grad-reduce
+)
+
+TORCH_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
+    --use-pytorch-profiler
+)
+
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
+
+MODEL_PARALLEL_ARGS=(
+    --tensor-model-parallel-size 2
+    --pipeline-model-parallel-size 1
+    --expert-model-parallel-size 8
+    --expert-tensor-parallel-size 1
+    --use-distributed-optimizer
+    --sequence-parallel
+)
+
+LOGGING_ARGS=(
+    --log-throughput \
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters -1 \
+    #--save $CHECKPOINT_PATH \
+    #--load $CHECKPOINT_PATH \
+    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
+    --no-load-optim \
+    --no-load-rng
+)
+
+if [ -n "${WANDB_API_KEY}" ]; then
+    LOGGING_ARGS+=(
+        --wandb-project ${WANDB_PROJECT:-"Mixtral"}
+        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
+    )
+fi
+
+APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py \
+    ${DISTRIBUTED_ARGS[@]} \
+    ${MODEL_ARGS[@]} \
+    ${MOE_ARGS[@]} \
+    ${DATA_ARGS[@]} \
+    ${TRAINING_ARGS[@]} \
+    ${MODEL_PARALLEL_ARGS[@]} \
+    ${LOGGING_ARGS[@]} \
+    "
+
+if [[ $profiling == "torch" ]]; then
+    APP+=" ${TORCH_PROFIE_ARGS[@]}"
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data
+    APP+=" ${HIP_PROFIE_ARGS[@]}"
+    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
+fi
+
+#for hygon cpu
+case ${LOCAL_RANK} in
+[0])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[4])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=4 --membind=4 ${APP}
+  ;;
+[5])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=5 --membind=5 ${APP}
+  ;;
+[6])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=6 --membind=6 ${APP}
+  ;;
+[7])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=7 --membind=7 ${APP}
+  ;;
+esac
+
--- a/examples/mixtral/train_mixtral_8x22B_multinodes.sh
+++ b/examples/mixtral/train_mixtral_8x22B_multinodes.sh
+#!/bin/bash
+
+for para in $*
+do
+    if [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+    fi
+done
+
+# Runs Mixtral 8x22B model
+source /opt/dtk/env.sh
+
+# default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export GLOG_minloglevel=3
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export OMP_NUM_THREADS=1
+export GPU_MAX_HW_QUEUES=10
+
+# nccl env
+export NCCL_ALGO=Ring
+export NCCL_MIN_NCHANNELS=32
+export NCCL_MAX_NCHANNELS=32
+export NCCL_NET_GDR_LEVEL=7
+export NCCL_NET_GDR_READ=1
+export RCCL_SDMA_COPY_ENABLE=0
+export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+export NCCL_TOPO_FILE="./topo-input.xml"
+
+# enable BatchLinear
+export GROUPED_GEMM_BatchLinear=1
+
+# data path
+CHECKPOINT_PATH="path to CKPT" 
+TOKENIZER_MODEL="path to tokenizer.model"
+DATA_PATH="path to my-mixtral_text_document"
+
+DISTRIBUTED_ARGS=(
+    --rank ${RANK}
+    --world-size ${WORLD_SIZE}
+    --local-rank ${LOCAL_RANK}
+    --dist-url tcp://${DIST_URL}:${DIST_PORT}
+)
+
+MODEL_ARGS=(
+    --use-mcore-models
+    --disable-bias-linear
+    --seq-length 4096
+    --max-position-embeddings 65536
+    --num-layers 56
+    --hidden-size 6144
+    --ffn-hidden-size 16384
+    --num-attention-heads 48
+    --init-method-std 0.01
+    --attention-dropout 0.0
+    --hidden-dropout 0.0
+    --normalization RMSNorm
+    --position-embedding-type rope
+    --swiglu
+    --untie-embeddings-and-output-weights
+    --group-query-attention
+    --num-query-groups 8
+    --no-masked-softmax-fusion
+    --no-position-embedding
+    --rotary-base 1000000
+    --ckpt-format torch
+)
+
+MOE_ARGS=(
+    --num-experts 8
+    --moe-router-topk 2
+    --moe-router-load-balancing-type aux_loss
+    --moe-aux-loss-coeff 1e-3
+    --moe-token-dispatcher-type alltoall
+    --moe-expert-capacity-factor 0.5
+    --moe-pad-expert-input-to-capacity
+    #--moe-grouped-gemm
+)
+
+DATA_ARGS=(
+    --tokenizer-type Llama2Tokenizer
+    --tokenizer-model ${TOKENIZER_MODEL}
+    --data-path $DATA_PATH
+    --split 99990,8,2
+)
+
+TRAINING_ARGS=(
+    --micro-batch-size 1
+    --global-batch-size 256
+    --lr 1e-4
+    --train-iters 10
+    --lr-decay-iters 320000
+    --lr-decay-style cosine
+    --min-lr 1.0e-5
+    --weight-decay 0.1
+    --lr-warmup-iters 500
+    --clip-grad 1.0
+    --bf16
+    --overlap-param-gather
+    --overlap-grad-reduce
+)
+
+TORCH_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-ep_tp1-cp1
+    --use-pytorch-profiler
+)
+
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
+
+MODEL_PARALLEL_ARGS=(
+    --tensor-model-parallel-size 4
+    --pipeline-model-parallel-size 8
+    --expert-model-parallel-size 8
+    --expert-tensor-parallel-size 1
+    --use-distributed-optimizer
+    --sequence-parallel
+)
+
+LOGGING_ARGS=(
+    --log-throughput \
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters -1 \
+    #--save $CHECKPOINT_PATH \
+    #--load $CHECKPOINT_PATH \
+    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
+    --no-load-optim \
+    --no-load-rng
+)
+
+if [ -n "${WANDB_API_KEY}" ]; then
+    LOGGING_ARGS+=(
+        --wandb-project ${WANDB_PROJECT:-"Mixtral"}
+        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
+    )
+fi
+
+APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py \
+    ${DISTRIBUTED_ARGS[@]} \
+    ${MODEL_ARGS[@]} \
+    ${MOE_ARGS[@]} \
+    ${DATA_ARGS[@]} \
+    ${TRAINING_ARGS[@]} \
+    ${MODEL_PARALLEL_ARGS[@]} \
+    ${LOGGING_ARGS[@]} \
+    "
+
+if [[ $profiling == "torch" ]]; then
+    APP+=" ${TORCH_PROFIE_ARGS[@]}"
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data
+    APP+=" ${HIP_PROFIE_ARGS[@]}"
+    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
+fi
+
+#for hygon cpu
+case ${LOCAL_RANK} in
+[0])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[4])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=4 --membind=4 ${APP}
+  ;;
+[5])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=5 --membind=5 ${APP}
+  ;;
+[6])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=6 --membind=6 ${APP}
+  ;;
+[7])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=7 --membind=7 ${APP}
+  ;;
+esac
+
--- a/examples/mixtral/train_mixtral_8x7B_1nodes.sh
+++ b/examples/mixtral/train_mixtral_8x7B_1nodes.sh
+#!/bin/bash
+
+for para in $*
+do
+    if [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+    fi
+done
+
+# Runs Mixtral 8x7B model
+source /opt/dtk/env.sh
+
+# default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export GLOG_minloglevel=3
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export OMP_NUM_THREADS=1
+export GPU_MAX_HW_QUEUES=10
+
+# nccl env
+export NCCL_ALGO=Ring
+export NCCL_MIN_NCHANNELS=32
+export NCCL_MAX_NCHANNELS=32
+export NCCL_NET_GDR_LEVEL=7
+export NCCL_NET_GDR_READ=1
+export RCCL_SDMA_COPY_ENABLE=0
+export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+export NCCL_TOPO_FILE="./topo-input.xml"
+
+# enable BatchLinear
+export GROUPED_GEMM_BatchLinear=1
+
+# data path
+CHECKPOINT_PATH="path to CKPT" 
+TOKENIZER_MODEL="path to tokenizer.model"
+DATA_PATH="path to my-mixtral_text_document"
+
+DISTRIBUTED_ARGS=(
+    --rank ${RANK}
+    --world-size ${WORLD_SIZE}
+    --local-rank ${LOCAL_RANK}
+    --dist-url tcp://${DIST_URL}:${DIST_PORT}
+)
+
+MODEL_ARGS=(
+    --use-mcore-models
+    --disable-bias-linear
+    --seq-length 4096
+    --max-position-embeddings 32768
+    --num-layers 4
+    --hidden-size 4096
+    --ffn-hidden-size 14336
+    --num-attention-heads 32
+    --init-method-std 0.01
+    --attention-dropout 0.0
+    --hidden-dropout 0.0
+    --normalization RMSNorm
+    --position-embedding-type rope
+    --swiglu
+    --untie-embeddings-and-output-weights
+    --group-query-attention
+    --num-query-groups 8
+    --no-masked-softmax-fusion
+    --no-position-embedding
+    --rotary-base 1000000
+    --ckpt-format torch
+)
+
+MOE_ARGS=(
+    --num-experts 8
+    --moe-router-topk 2
+    --moe-router-load-balancing-type aux_loss
+    --moe-aux-loss-coeff 1e-2
+    --moe-token-dispatcher-type alltoall
+    --moe-expert-capacity-factor 0.5
+    --moe-pad-expert-input-to-capacity
+    #--moe-grouped-gemm
+)
+
+DATA_ARGS=(
+    --tokenizer-type Llama2Tokenizer
+    --tokenizer-model ${TOKENIZER_MODEL}
+    --data-path $DATA_PATH
+    --split 99990,8,2
+)
+
+TRAINING_ARGS=(
+    --micro-batch-size 1
+    --global-batch-size 256
+    --lr 1e-4
+    --train-iters 10
+    --lr-decay-iters 320000
+    --lr-decay-style cosine
+    --min-lr 1.0e-5
+    --weight-decay 0.1
+    --lr-warmup-iters 500
+    --clip-grad 1.0
+    --bf16
+    --overlap-param-gather
+    --overlap-grad-reduce
+)
+
+TORCH_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_mixtral8x7B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
+    --use-pytorch-profiler
+)
+
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
+
+MODEL_PARALLEL_ARGS=(
+    --tensor-model-parallel-size 2
+    --pipeline-model-parallel-size 1
+    --expert-model-parallel-size 8
+    --expert-tensor-parallel-size 1
+    --use-distributed-optimizer
+    --sequence-parallel
+)
+
+LOGGING_ARGS=(
+    --log-throughput \
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters -1 \
+    #--save $CHECKPOINT_PATH \
+    #--load $CHECKPOINT_PATH \
+    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
+    --no-load-optim \
+    --no-load-rng
+)
+
+if [ -n "${WANDB_API_KEY}" ]; then
+    LOGGING_ARGS+=(
+        --wandb-project ${WANDB_PROJECT:-"Mixtral"}
+        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
+    )
+fi
+
+APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py \
+    ${DISTRIBUTED_ARGS[@]} \
+    ${MODEL_ARGS[@]} \
+    ${MOE_ARGS[@]} \
+    ${DATA_ARGS[@]} \
+    ${TRAINING_ARGS[@]} \
+    ${MODEL_PARALLEL_ARGS[@]} \
+    ${LOGGING_ARGS[@]} \
+    "
+
+if [[ $profiling == "torch" ]]; then
+    APP+=" ${TORCH_PROFIE_ARGS[@]}"
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data
+    APP+=" ${HIP_PROFIE_ARGS[@]}"
+    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
+fi
+
+#for hygon cpu
+case ${LOCAL_RANK} in
+[0])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[4])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=4 --membind=4 ${APP}
+  ;;
+[5])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=5 --membind=5 ${APP}
+  ;;
+[6])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=6 --membind=6 ${APP}
+  ;;
+[7])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=7 --membind=7 ${APP}
+  ;;
+esac
+
--- a/examples/mixtral/train_mixtral_8x7B_multinodes.sh
+++ b/examples/mixtral/train_mixtral_8x7B_multinodes.sh
+#!/bin/bash
+
+for para in $*
+do
+    if [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+    fi
+done
+
+# Runs Mixtral 8x7B model
+source /opt/dtk/env.sh
+
+# default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
+MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export GLOG_minloglevel=3
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export OMP_NUM_THREADS=1
+export GPU_MAX_HW_QUEUES=10
+
+# nccl env
+export NCCL_ALGO=Ring
+export NCCL_MIN_NCHANNELS=32
+export NCCL_MAX_NCHANNELS=32
+export NCCL_NET_GDR_LEVEL=7
+export NCCL_NET_GDR_READ=1
+export RCCL_SDMA_COPY_ENABLE=0
+export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+export NCCL_TOPO_FILE="./topo-input.xml"
+
+# enable BatchLinear
+export GROUPED_GEMM_BatchLinear=1
+
+# data path
+CHECKPOINT_PATH="path to CKPT" 
+TOKENIZER_MODEL="path to tokenizer.model"
+DATA_PATH="path to my-mixtral_text_document"
+
+# Rank and rendezvous flags for this process.
+# Each expansion is quoted so that a value is always exactly one array
+# element and is never subject to globbing when the array is built.
+DISTRIBUTED_ARGS=(
+    --rank "${RANK}"
+    --world-size "${WORLD_SIZE}"
+    --local-rank "${LOCAL_RANK}"
+    --dist-url "tcp://${DIST_URL}:${DIST_PORT}"
+)
+
+MODEL_ARGS=(
+    --use-mcore-models
+    --disable-bias-linear
+    --seq-length 4096
+    --max-position-embeddings 32768
+    --num-layers 32
+    --hidden-size 4096
+    --ffn-hidden-size 14336
+    --num-attention-heads 32
+    --init-method-std 0.01
+    --attention-dropout 0.0
+    --hidden-dropout 0.0
+    --normalization RMSNorm
+    --position-embedding-type rope
+    --swiglu
+    --untie-embeddings-and-output-weights
+    --group-query-attention
+    --num-query-groups 8
+    --no-masked-softmax-fusion
+    --no-position-embedding
+    --rotary-base 1000000
+    --ckpt-format torch
+)
+
+# Mixture-of-experts flags: expert count, router settings, token dispatcher
+# and capacity handling, in that order.
+MOE_ARGS=(
+    --num-experts 8 --moe-router-topk 2
+    --moe-router-load-balancing-type aux_loss --moe-aux-loss-coeff 1e-2
+    --moe-token-dispatcher-type alltoall
+    --moe-expert-capacity-factor 0.5 --moe-pad-expert-input-to-capacity
+)
+# Currently left out of the list: --moe-grouped-gemm
+
+# Tokenizer and dataset flags.
+# The two path variables are quoted so that a path containing spaces stays a
+# single array element instead of being split when the array is built.
+DATA_ARGS=(
+    --tokenizer-type Llama2Tokenizer
+    --tokenizer-model "${TOKENIZER_MODEL}"
+    --data-path "${DATA_PATH}"
+    --split 99990,8,2
+)
+
+TRAINING_ARGS=(
+    --micro-batch-size 1
+    --global-batch-size 256
+    --lr 1e-4
+    --train-iters 10
+    --lr-decay-iters 320000
+    --lr-decay-style cosine
+    --min-lr 1.0e-5
+    --weight-decay 0.1
+    --lr-warmup-iters 500
+    --clip-grad 1.0
+    --bf16
+    --overlap-param-gather
+    --overlap-grad-reduce
+)
+
+TORCH_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 8 9 10 11 
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_mixtral8x7B_4nodes_tp2-pp4-ep8-ep_tp1-cp1
+    --use-pytorch-profiler
+)
+
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
+
+# Parallelism layout: tensor 2, pipeline 4, expert 8, expert-tensor 1.
+MODEL_PARALLEL_ARGS=(
+    --tensor-model-parallel-size 2 --pipeline-model-parallel-size 4
+    --expert-model-parallel-size 8 --expert-tensor-parallel-size 1
+)
+# Optimizer sharding and sequence parallelism switches.
+MODEL_PARALLEL_ARGS+=(--use-distributed-optimizer --sequence-parallel)
+
+LOGGING_ARGS=(
+    --log-throughput \
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters -1 \
+    #--save $CHECKPOINT_PATH \
+    #--load $CHECKPOINT_PATH \
+    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
+    --no-load-optim \
+    --no-load-rng
+)
+
+if [ -n "${WANDB_API_KEY}" ]; then
+    LOGGING_ARGS+=(
+        --wandb-project ${WANDB_PROJECT:-"Mixtral"}
+        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
+    )
+fi
+
+APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py \
+    ${DISTRIBUTED_ARGS[@]} \
+    ${MODEL_ARGS[@]} \
+    ${MOE_ARGS[@]} \
+    ${DATA_ARGS[@]} \
+    ${TRAINING_ARGS[@]} \
+    ${MODEL_PARALLEL_ARGS[@]} \
+    ${LOGGING_ARGS[@]} \
+    "
+
+if [[ $profiling == "torch" ]]; then
+    APP+=" ${TORCH_PROFIE_ARGS[@]}"
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data
+    APP+=" ${HIP_PROFIE_ARGS[@]}"
+    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
+fi
+
+#for hygon cpu
+case ${LOCAL_RANK} in
+[0])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[4])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=4 --membind=4 ${APP}
+  ;;
+[5])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=5 --membind=5 ${APP}
+  ;;
+[6])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=6 --membind=6 ${APP}
+  ;;
+[7])
+  export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  #numactl --cpunodebind=7 --membind=7 ${APP}
+  ;;
+esac
+
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
 # Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 """Pretrain GPT."""

-import os
+import os, sys
+current_dir = os.path.dirname(os.path.abspath(__file__))
+megatron_path = os.path.join(current_dir, "Megatron-LM")
+sys.path.append(megatron_path)
 import torch
 from functools import partial
 from contextlib import nullcontext