Commit fe0b03b5 authored by silencealiang

fix llama2 bug and update file format

parent ee3ff5df
......@@ -163,6 +163,11 @@ class CoreAdaptation(MegatronAdaptationABC):
staticmethod,
apply_wrapper=True)
# reduce_scatter_to_sequence_parallel_region
MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_scatter_to_sequence_parallel_region',
torch._dynamo.disable,
apply_wrapper=True)
# flux
if int(os.getenv("USE_FLUX_OVERLAP", "0")):
from ..core.tensor_parallel.layers import (
......
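For context, the registration above swaps Megatron's `reduce_scatter_to_sequence_parallel_region` for a `torch._dynamo.disable`-wrapped version so that TorchDynamo skips tracing it. A minimal sketch of that wrapper pattern, with a simplified, hypothetical `register` helper standing in for `MegatronAdaptation.register`:

```python
import importlib

import torch


def register(dotted_path: str, wrapper, apply_wrapper: bool = True):
    """Hypothetical helper: replace `module.attr` with `wrapper(attr)` in place."""
    module_path, attr_name = dotted_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    original = getattr(module, attr_name)
    setattr(module, attr_name, wrapper(original) if apply_wrapper else wrapper)


# torch._dynamo.disable(fn) returns a callable that Dynamo will not trace,
# which keeps the reduce-scatter collective out of compiled graphs.
register(
    "megatron.core.tensor_parallel.mappings.reduce_scatter_to_sequence_parallel_region",
    torch._dynamo.disable,
    apply_wrapper=True,
)
```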
......@@ -6,6 +6,7 @@ from functools import wraps
from megatron.training import get_args
from megatron.core import tensor_parallel
from megatron.legacy.model.enums import AttnType
from megatron.core.utils import deprecate_inference_params
from megatron.core.models.common.embeddings import apply_rotary_pos_emb
from megatron.legacy.model.module import MegatronModule
......@@ -94,18 +95,20 @@ class ParallelAttentionPatch(MegatronModule):
"""
def forward(self, hidden_states, attention_mask,
encoder_output=None, inference_params=None,
rotary_pos_emb=None):
encoder_output=None, inference_context=None,
rotary_pos_emb=None, *, inference_params=None):
# hidden_states: [sq, b, h]
inference_context = deprecate_inference_params(inference_context, inference_params)
# =================================================
# Pre-allocate memory for key-values for inference.
# =================================================
is_first_step = False
if inference_params:
if self.layer_number not in inference_params.key_value_memory_dict:
inf_max_seq_len = inference_params.max_sequence_length
inf_max_batch_size = inference_params.max_batch_size
if inference_context:
if self.layer_number not in inference_context.key_value_memory_dict:
inf_max_seq_len = inference_context.max_sequence_length
inf_max_batch_size = inference_context.max_batch_size
inference_key_memory = self._allocate_memory(
inf_max_seq_len, inf_max_batch_size,
self.num_query_groups_per_partition)
......@@ -113,12 +116,12 @@ class ParallelAttentionPatch(MegatronModule):
inf_max_seq_len, inf_max_batch_size,
self.num_query_groups_per_partition)
inference_params.key_value_memory_dict[self.layer_number] = (
inference_context.key_value_memory_dict[self.layer_number] = (
inference_key_memory, inference_value_memory)
is_first_step = True
else:
inference_key_memory, inference_value_memory = \
inference_params.key_value_memory_dict[self.layer_number]
inference_context.key_value_memory_dict[self.layer_number]
# =====================
# Query, Key, and Value
......@@ -188,13 +191,14 @@ class ParallelAttentionPatch(MegatronModule):
else:
rotary_pos_emb = ((rotary_pos_emb,) * 2)
if inference_params:
batch_start = inference_params.batch_size_offset
if inference_context:
batch_start = inference_context.batch_size_offset
batch_end = batch_start + key_layer.size(1)
assert batch_end <= inference_key_memory.size(1)
sequence_start = inference_params.sequence_len_offset
sequence_start = inference_context.sequence_len_offset
sequence_end = sequence_start + key_layer.size(0)
assert sequence_end <= inference_key_memory.size(0)
assert sequence_end <= inference_key_memory.size(0), ("Current sequence length is "
"longer than expected maximum sequence length! Increase inference_max_seq_length.")
# Copy key and values.
inference_key_memory[sequence_start:sequence_end,
batch_start:batch_end, ...] = key_layer
......
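The patch above migrates the attention forward pass from the deprecated `inference_params` keyword to `inference_context`, funneling the old name through `deprecate_inference_params` from `megatron.core.utils`. A minimal sketch of what such a shim typically looks like (the real implementation may differ; this version is an assumption for illustration):

```python
import warnings


def deprecate_inference_params(inference_context, inference_params):
    """Prefer the new `inference_context`; warn and fall back to the old name."""
    if inference_context is None and inference_params is not None:
        warnings.warn(
            "`inference_params` is deprecated; pass `inference_context` instead.",
            DeprecationWarning,
        )
        return inference_params
    return inference_context
```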
......@@ -5,10 +5,22 @@ do
fi
done
mpirun -np 8 --allow-run-as-root \
train_deepseekv3_671B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"
wait
mpirun -np 8 --allow-run-as-root \
train_deepseekv3_671B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
wait
\ No newline at end of file
......@@ -5,13 +5,25 @@ do
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"
mpirun -np 32 --hostfile hostfile_deepseekv3_671B_4nodes \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_4nodes.sh node002 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
wait
train_deepseekv3_671B_4nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
wait
\ No newline at end of file
......@@ -5,13 +5,25 @@ do
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"
mpirun -np 1024 --hostfile hostfile_deepseekv3_671B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_multinodes.sh node001 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
wait
train_deepseekv3_671B_multinodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-128nodes-`date +%F-%H%M`.log 2>&1
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
wait
\ No newline at end of file
......@@ -2,17 +2,20 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
......@@ -51,7 +54,7 @@ PR=bf16
### PARALLEL / BOOL OPTION ###
TP=1
PP=2
PP=1
CP=1
ETP=1
EP=4
......@@ -65,17 +68,17 @@ SFT=false
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
if [ $FL = true ]; then
......@@ -389,7 +392,7 @@ TORCH_PROFIE_ARGS=" \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--profile-dir torch_prof_deepseekv3_1nodes_tp1-pp1-ep4-etp1-cp1 \
--use-pytorch-profiler \
"
......
......@@ -2,17 +2,20 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
......@@ -65,17 +68,17 @@ SFT=false
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
if [ $FL = true ]; then
......
......@@ -2,17 +2,20 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
......@@ -65,17 +68,17 @@ SFT=false
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
if [ $FL = true ]; then
......@@ -389,7 +392,7 @@ TORCH_PROFIE_ARGS=" \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--profile-dir torch_prof_deepseekv3_128nodes_tp4-pp8-ep64-etp2-cp1 \
--use-pytorch-profiler \
"
......
......@@ -5,10 +5,22 @@ do
fi
done
mpirun -np 8 --allow-run-as-root \
train_gpt_567B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
# Runs GPT 567B model
source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to redpajama_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait
mpirun -np 8 --allow-run-as-root \
train_gpt_567B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf gpt_dataset/redpajama_text_document
wait
\ No newline at end of file
......@@ -5,13 +5,25 @@ do
fi
done
mpirun -np 1024 --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_gpt_567B_multinodes.sh node059 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
# Runs GPT 567B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to redpajama_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait
mpirun -np 1024 --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_gpt_567B_multinodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf gpt_dataset/redpajama_text_document
\ No newline at end of file
wait
\ No newline at end of file
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -83,8 +86,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 98,2,0
)
......@@ -104,23 +107,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep4-ep_tp2-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
......@@ -145,10 +131,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep4-etp2-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-project ${WANDB_PROJECT:-"GPT"}
--wandb-exp-name ${WANDB_NAME:-"GPT_567B"}
)
fi
......@@ -204,4 +207,4 @@ case ${LOCAL_RANK} in
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
esac
\ No newline at end of file
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -83,8 +86,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 98,2,0
)
......@@ -104,23 +107,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_64nodes_tp4-pp16-ep16-ep_tp4-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 16
......@@ -145,10 +131,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_128nodes_tp4-pp16-ep16-etp4-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-project ${WANDB_PROJECT:-"GPT"}
--wandb-exp-name ${WANDB_NAME:-"GPT_567B"}
)
fi
......
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Llama2 7B model
source /opt/dtk/env.sh
DATA_PATH="path to oscar-1GB_head-llama2_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
mpirun -np 8 --allow-run-as-root \
train_llama2_7b_1nodes.sh \
localhost \
34577 \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
wait
\ No newline at end of file
#!/bin/bash
# set -eux
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
#default env
#export FLASH_ATTENTION_PRINT_PARAM=1
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
# export GPU_MAX_HW_QUEUES=10
#export HIP_ALLOC_INITIALIZE=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# hipblaslt library
export LD_LIBRARY_PATH=/data/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# rocblas
export LD_LIBRARY_PATH=/data/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# torch: collapse multi-stream execution onto a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
......@@ -48,187 +49,143 @@ export SENDRECV_STREAM_WITH_COMPUTE=1
# increase the compile cache size
export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/data/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
GPT_MODEL_ARGS=(
--seq-length 4096
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--max-position-embeddings 4096
--normalization RMSNorm # LightopRMSNorm
--position-embedding-type rope # none #
--untie-embeddings-and-output-weights # handle embedding and output weights separately for more flexibility
--normalization LightopRMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights
)
export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the Triton flash-attention path
# --transformer-impl transformer_engine # use these two options for the mcore path
# --use-mcore-models
# --transformer-impl local # use these two options for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl local # use these two options for the legacy path
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256 #256 #240 #60 #512 #64
--train-iters 50
--global-batch-size 256
--train-iters 10
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # enabling fp16 requires specifying loss-scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # in DP communication, average gradients/params directly instead of summing (to one device) and then averaging
# --recompute-granularity full # enable recomputation: lower memory use at the cost of extra runtime
# --recompute-num-layers 5 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap DDP grad reduce
# --tp-comm-overlap # overlap tensor-parallel comm with GEMM; this optimization is not yet adapted
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
--ddp-average-in-collective
--overlap-grad-reduce
--use-flash-attn
)
# environment variables for the torch flash-attention path
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
# --context-parallel-size 2
# --num-layers-per-virtual-pipeline-stage 4
# --microbatch-group-size-per-virtual-pipeline-stage 1
# --no-overlap-p2p-communication # when enabled
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-throughput
--eval-iters 50
--eval-iters 5
--log-interval 1
--save-interval 1000
--eval-interval 1000
--save $SAVE_PATH
--load $SAVE_PATH
--tensorboard-dir $TENSORBOARD_LOGS_PATH
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
PROFILE_ARGS=(
TORCH_PROFIE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7
--profile-dir prof_data
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
--use-pytorch-profiler
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# enable profiling
# ${PROFILE_ARGS[@]} \
# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
# for Hygon CPUs
case ${LOCAL_RANK} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
esac
\ No newline at end of file
# Mixtral 8x7B Model Inference and Finetuning
## Download Mixtral 8x7B Checkpoints
Download the Mixtral 8x7B HF-format checkpoint from the [HF hub](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/).
Alternatively, run the following script to download Mixtral 8x7B into a specific folder.
```python
from huggingface_hub import snapshot_download
SAVED_DIR = "" # Specify the saved directory
# Download HF checkpoints
snapshot_download(repo_id="mistralai/Mixtral-8x7B-v0.1", ignore_patterns=["*.pt"], local_dir=SAVED_DIR, local_dir_use_symlinks=False)
```
## Convert Mixtral 8x7B checkpoints from HF to MCore
The HF checkpoints can be converted to Megatron format with the provided HF checkpoint converter.
The target model parallel sizes (e.g. TP, PP, EP) must be specified.
The converter does not yet support distributed checkpointing, so each parallel configuration requires its own converted checkpoint.
- For training, the recommended model parallel config is TP1EP8PP4
- For inference, the recommended model parallel config is TP1EP1PP2
```
TOKENIZER_MODEL=/workspace/checkpoints/mixtral-hf/tokenizer.model
MEGATRON_PATH="/workspace/megatron-lm"
export PYTHONPATH=$MEGATRON_PATH:$PYTHONPATH
export CUDA_DEVICE_MAX_CONNECTIONS=1
TARGET_TP_SIZE=""
TARGET_EP_SIZE=""
TARGET_PP_SIZE=""
HF_FORMAT_DIR=/workspace/checkpoints/mixtral-hf
MEGATRON_FORMAT_DIR=/workspace/checkpoints/mixtral-mcore-TP${TARGET_TP_SIZE}PP${TARGET_PP_SIZE}EP${TARGET_EP_SIZE}
python tools/checkpoint/convert.py \
--model-type GPT \
--loader loader_mixtral_hf \
--saver mcore \
--target-tensor-parallel-size ${TARGET_TP_SIZE} \
--target-pipeline-parallel-size ${TARGET_PP_SIZE} \
--target-expert-parallel-size ${TARGET_EP_SIZE} \
--load-dir ${HF_FORMAT_DIR} \
--save-dir ${MEGATRON_FORMAT_DIR} \
--tokenizer-model ${TOKENIZER_MODEL}
```
## Text generation with Mixtral 8x7B
Inference with Mixtral 8x7B requires at least 2 GPUs, so a distributed checkpoint with EP>=2 or PP>=2 (converted with the script above) is needed.
Megatron-LM includes a simple REST server for text generation in `tools/run_text_generation_server.py`; launch it with the following script:
```
#!/bin/bash
# This example will start serving the Mixtral 8x7B model.
DISTRIBUTED_ARGS="--nproc_per_node 2 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint>
TOKENIZER_MODEL=<Path to tokenizer (e.g. /tokenizer.model)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 2 \
--expert-model-parallel-size 1 \
--load ${CHECKPOINT} \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model $TOKENIZER_MODEL \
--use-mcore-models \
--max-position-embeddings 32768 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--normalization RMSNorm \
--disable-bias-linear \
--position-embedding-type rope \
--no-position-embedding \
--swiglu \
--untie-embeddings-and-output-weights \
--group-query-attention \
--num-query-groups 8 \
--bf16 \
--micro-batch-size 1 \
--seq-length 1024 \
--seed 42 \
--num-experts 8 \
--moe-router-topk 2 \
--moe-token-dispatcher-type alltoall \
--moe-grouped-gemm \
--mock-data \
--rotary-base 1000000
```
Once the server is running, you can use `tools/text_generation_cli.py` to query it; it takes a single argument, the host the server is running on.
```
python tools/text_generation_cli.py localhost:5000
```
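If you prefer to query the server programmatically rather than through the CLI, a minimal sketch with `requests` is shown below. It assumes the server exposes the same PUT `/api` endpoint that the bundled CLI uses, with a JSON body carrying `prompts` and `tokens_to_generate`; verify the endpoint and fields against your Megatron-LM version.

```python
import requests

# Hypothetical client for the text generation REST server started above.
response = requests.put(
    "http://localhost:5000/api",
    json={"prompts": ["The capital of France is"], "tokens_to_generate": 32},
    headers={"Content-Type": "application/json"},
)
print(response.json())
```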
## Finetuning from pretrained Mixtral 8x7B
To finetune the pretrained Mixtral 8x7B model, use the following script:
```bash
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.04-py3
CHECKPOINT_PATH="" # Specify path to checkpoint dir
TOKENIZER_MODEL="" # Specify path to tokenizer.model
DATA_PATH="" # Specify path to data
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/mixtral/train_mixtral_8x7b_distributed.sh $CHECKPOINT_PATH $TOKENIZER_MODEL $DATA_PATH
```
The above functionality also applies to Mixtral 8x22B; set the model config (including hidden_size/head_num/num_layers/ffn_hidden_size) according to the original [config](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1/blob/main/config.json).
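As a rough illustration, the snippet below reads those fields from the Hugging Face `config.json` and prints matching Megatron-style flags. The key names follow the HF Mixtral config; the flag mapping is an assumption to check against your training script, and the file path is a placeholder.

```python
import json

# Path to the downloaded HF checkpoint directory (placeholder).
with open("Mixtral-8x22B-v0.1/config.json") as f:
    cfg = json.load(f)

flags = {
    "--hidden-size": cfg["hidden_size"],
    "--ffn-hidden-size": cfg["intermediate_size"],
    "--num-layers": cfg["num_hidden_layers"],
    "--num-attention-heads": cfg["num_attention_heads"],
    "--num-query-groups": cfg["num_key_value_heads"],
    "--num-experts": cfg["num_local_experts"],
}
print(" ".join(f"{k} {v}" for k, v in flags.items()))
```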
## Acknowledgements
Contributors outside NVIDIA to the Hugging Face converter and the Mixtral example in Megatron-Core:
- Peng Li <jerry.lp@alibaba-inc.com>
- Jun Huang <huangjun.hj@alibaba-inc.com>
......@@ -5,10 +5,22 @@ do
fi
done
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x22B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
wait
\ No newline at end of file
......@@ -5,13 +5,25 @@ do
fi
done
mpirun -np 64 --hostfile hostfile_mixtral_8x22B \
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
mpirun -np 32 --hostfile hostfile_mixtral_8x22B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > log-8nodes-`date +%F-%H%M`.log 2>&1
wait
train_mixtral_8x22B_multinodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
wait
\ No newline at end of file
......@@ -5,10 +5,22 @@ do
fi
done
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x7B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
wait
\ No newline at end of file
......@@ -5,13 +5,25 @@ do
fi
done
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
mpirun -np 32 --hostfile hostfile_mixtral_8x7B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
wait
train_mixtral_8x7B_multinodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
wait
\ No newline at end of file