Commit 6c3cfb1d authored by silencealiang's avatar silencealiang
Browse files

update model parameters format

parent 935bfd74
[submodule "Megatron-LM"]
path = Megatron-LM
url = https://github.com/NVIDIA/Megatron-LM.git
branch = 4429e8ebe21fb0
branch = d580efc68a9f0d
[submodule]
Megatron-LM = main
......@@ -25,11 +25,11 @@ class MegatronAdaptation:
MegatronAdaptation.apply()
# apply features
from .patch_utils import MegatronPatchesManager
from .features_manager import a2a_overlap_adaptation
# from .patch_utils import MegatronPatchesManager
# from .features_manager import a2a_overlap_adaptation
a2a_overlap_adaptation(MegatronPatchesManager)
MegatronPatchesManager.apply_patches()
# a2a_overlap_adaptation(MegatronPatchesManager)
# MegatronPatchesManager.apply_patches()
@classmethod
def register(cls, orig_func_name, new_func=None, force_patch=False, create_dummy=False, apply_wrapper=False, remove_origin_wrappers=False):
......
......@@ -94,7 +94,7 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
print("> initializing torch distributed ...", flush=True)
# Manually set the device ids.
if device_count > 0:
torch.cuda.set_device(args.local_rank)
torch.cuda.set_device(args.local_rank % device_count)
device_id = torch.device(f'cuda:{args.local_rank}')
else:
device_id = None
......
# Launch the single-node DeepSeek-V3 training run under mpirun.
# Recognized option: --profiling=<torch|hip>; when present it is forwarded to
# the per-node script and the profiler-related GPU env vars are enabled.
# Fix: iterate over "$@" (quoted) so arguments containing whitespace are not
# re-split, and quote $profiling when forwarding it.
for para in "$@"
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
# These two env vars are required for meaningful profiler traces on this stack.
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
# 8 ranks = 8 GPUs on one node; stdout/stderr of the whole run goes to output.log.
mpirun -np 8 --allow-run-as-root \
train_deepseek_v3_1node.sh localhost --profiling="$profiling" > output.log 2>&1
wait
# Remove the checkpoint directory produced by the smoke run.
rm -rf CKPT
# Parse launcher options: only --profiling=<mode> is recognized here; the value
# is forwarded verbatim to the per-node training script below.
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Those variables need to modify
# NOTE(review): all of these are empty placeholders; mpirun -np "" fails, so
# they must be filled in before this launcher is usable.
GPUS="" # how many gpus to use
DTK_ENV="" # where env.sh of dtk
NCCL_ENV="" # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST="" # hostname
PORT="" # port id
DATA_PATH="" # path to mmap_deepseekv3_datasets_text_document
TOKENIZER_MODEL_PATH="" # path to deepseekv3_dataset
CHECKPOINT_PATH="" # path to ckpt
# Runs DeepseekV3 671B model
# One MPI rank per GPU; ${GPUS}/8 (nodes) selects both the per-node script name
# and the log-file tag. Each rank sources the DTK and NCCL environments before
# exec'ing the training script.
mpirun -np ${GPUS} --hostfile hostfile_deepseekv3_671B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
bash -c "
source ${DTK_ENV} && \
source ${NCCL_ENV} && \
./train_deepseekv3_671B_$((${GPUS} / 8))nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling" > log-$((${GPUS} / 8))nodes-`date +%F-%H%M`.log 2>&1
wait
\ No newline at end of file
#!/bin/bash
# Parse "--key=value" style CLI options into plain shell variables.
# Recognized: --data_path, --tokenizer_path, --checkpoint_path, --profiling.
# Unknown options are silently ignored (same as the original if/elif chain).
parse_args() {
  local para
  for para in "$@"; do
    case "$para" in
      --data_path*)       data_path=${para#*=} ;;
      --tokenizer_path*)  tokenizer_path=${para#*=} ;;
      --checkpoint_path*) checkpoint_path=${para#*=} ;;
      --profiling*)       profiling=${para#*=} ;;
    esac
  done
}
# "$@" (quoted) preserves arguments containing whitespace; the original
# unquoted $* would have re-split them.
parse_args "$@"
# default env
# Rendezvous endpoint for torch.distributed, passed positionally by the
# mpirun launcher: $1 = host, $2 = port.
DIST_URL=${1}
DIST_PORT=${2}
# Rank / world size are provided by Open MPI's environment.
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
# This script lives two directory levels below the Megatron checkout root.
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=5 # whether to enable this depends on the actual setup
### BASE CONFIG ###
# Core training hyper-parameters for the 671B run.
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=4096
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
# TP/PP/CP/ETP/EP: tensor / pipeline / context / expert-tensor / expert
# parallel sizes. SP: sequence parallelism, DO: distributed optimizer,
# SFT: supervised fine-tuning mode.
# FL presumably toggles the flash-attention backend (see branch below) —
# TODO confirm.
TP=4
PP=8
CP=1
ETP=2
EP=64
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###
### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
# Paths come from the --data_path/--checkpoint_path/--tokenizer_path CLI args.
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
# FL=true: no-op (keep the default attention backend); FL=false would request
# automatic backend selection instead.
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
attn_backend_option=" \
--attention-backend auto
"
fi
# Model hyper-parameters for the DeepSeek-V3 "A37B" (671B total / 37B
# activated) configuration, plus the MoE/MLA command-line flags built from them.
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=61
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
# MLA low-rank projection dims and per-head dims.
Q_LORA_RANK=1536
KV_LORA_RANK=512
QK_NOPE_HEAD_DIM=128
QK_ROPE_HEAD_DIM=64
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=256
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
# NOTE: no comments may be placed inside the quoted string below — every word
# in it is splatted onto the python command line at launch time.
# --moe-layer-freq ([0]*3+[1]*58): first 3 layers dense, remaining 58 MoE.
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 4 \
--moe-router-num-groups 8 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*3+[1]*58) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controlled by env vars; fill in defaults when unset.
# Fix: expansions inside [ ] are quoted so that an unset or multi-word value
# cannot break the test's argument parsing.
if [ -z "${MP_DATASET_TYPE:-}" ];then
MP_DATASET_TYPE="idxmap"
fi
if [ -z "${MP_AC_LAYERS:-}" ];then
MP_AC_LAYERS=1
fi
# Virtual pipeline: only emitted when MP_VP is set.
if [ -z "${MP_VP:-}" ]; then
vp_option=""
else
vp_option=" \
--num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z "${MP_SFT_PACKING:-}" ]; then
MP_SFT_PACKING=false
fi
# 1 when tensor parallelism is active (TP > 1), else 0; consulted by the
# activation-offload branch below.
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# TP comm overlap is intentionally disabled for now; keep the grad/param
# overlap flags only.
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
# Activation-checkpointing (recompute) strategy selected by $AC:
#   full    - uniform full recompute in MP_AC_LAYERS-layer chunks
#   sel     - selective recompute of attention activations only
#   none    - no recompute
#   offload - CPU activation offloading (disables grad/param-gather overlap)
# Fix: test operands are quoted so empty/odd values cannot break `[` parsing.
if [ "$AC" = full ]; then
# Layers per pipeline stage must divide evenly into recompute chunks.
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ "$_check" != 0 ]; then
echo "the num layers per pp rank must be a multiple of the recompute layers."
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ "$AC" = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
elif [ "$AC" = none ]; then
activation_checkpoint_options=" \
"
elif [ "$AC" = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ "$TP_COMM_OVERLAP" -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
--tp-comm-overlap"
else
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option=""
fi
fi
# Precision flags selected by $PR (fp16 / bf16 / fp8). fp8 still trains the
# master weights in bf16 and layers the fp8 recipe on top.
case "$PR" in
fp16)
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
;;
bf16)
pr_options=" \
--bf16"
;;
fp8)
pr_options=" \
--bf16 \
--fp8-format hybrid \
--fp8-amax-compute-algo max \
--fp8-amax-history-len 1024"
;;
esac
# Optimizer CPU offload only works with the distributed optimizer, so force
# DO on when offload was requested without it.
if [ "$OPTIMIZER_OFFLOAD" != false ] && [ "$DO" = false ]; then
echo "Offload optimizer is valid only if \$DO=true"
DO=true
fi
# Distributed-optimizer flag (empty option string when DO=false).
if [ "$DO" = true ]; then
do_option=" \
--use-distributed-optimizer"
elif [ "$DO" = false ]; then
do_option=" \
"
fi
# Sequence parallelism is only meaningful together with TP > 1.
if [ "$SP" = true ] && [ "$TP" -gt 1 ]; then
sp_option=" \
--sequence-parallel"
elif [ "$SP" = false ]; then
sp_option=" \
"
fi
# Uneven pipeline split: give the first PP stage MP_PP0_LAYERS layers and
# spread the remaining layers evenly over the other PP-1 stages.
if [ -z ${MP_PP0_LAYERS} ];then
uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
# Leftover layers must divide evenly among the remaining stages.
_check=$(( ( $NUM_LAYERS - ${MP_PP0_LAYERS} ) % ( ${PP} - 1 ) ))
if [ $_check != 0 ]; then
echo "With uneven pipelineing the left over layers must be divisible by left over stages."
exit -1
fi
uneven_split_option=" \
--decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}
"
else
echo "uneven pipeline split must be used when PP > 1"
exit -1
fi
# Load initial weights unless the checkpoint path is the literal "none".
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--load $PRETRAIN_CHECKPOINT_PATH"
fi
# When not "false", OPTIMIZER_OFFLOAD holds the fraction of optimizer state
# to keep on CPU.
if [ $OPTIMIZER_OFFLOAD != false ]; then
offload_option=" \
--optimizer-cpu-offload \
--use-precision-aware-optimizer \
--optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi
# SFT mode overrides the iteration schedule; pretrain mode derives it from
# the token budgets computed above.
if [ $SFT = true ]; then
# NOTE(review): ${25} and ${26} are positional parameters 25 and 26, which
# the launcher never passes — these look like they were meant to be the
# literals 25 / 26. Confirm before enabling SFT.
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
# Convert the token budgets into iteration counts (TRAIN_ITERS itself is
# pinned to the small value set in the model-size block above).
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
# Dataset plumbing: "raw" JSON-SFT files need the cyclic dataloader; the
# default "idxmap" path uses a pre-built mmap dataset with a 99/1/0 split.
# Fix: quote the [ ] operands so an empty MP_DATASET_TYPE / MP_SFT_PACKING
# cannot produce a "unary operator expected" error.
if [ "${MP_DATASET_TYPE}" = "raw" ]; then
dataset_options=" \
--train-data-path ${DATASET_PATH} \
--valid-data-path ${VALID_DATASET_PATH} \
--dataloader-type cyclic \
--dataset JSON-SFT"
else
dataset_options=" \
--data-path ${DATASET_PATH} \
--split 99,1,0"
fi
# MLA has no THD-format attention support, so sequence packing is always off;
# both branches intentionally produce an empty option string.
if [ "${MP_SFT_PACKING}" = true ]; then
echo "Currently MLA do not support THD format attention, thus sequence packing can not be used..."
packing_options=""
else
packing_options=""
fi
##### Prepare logdirs #######
# Run name encodes the main hyper-parameters so different configs get
# distinct tensorboard/checkpoint directories.
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
# Timestamped tensorboard dir so reruns of the same config do not collide.
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
# Tokenizer/config copying into the save dir is currently disabled.
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
# Core Megatron-LM command-line flags (optimizer schedule, model dims, logging,
# parallelism, tokenizer, MLA/rope settings). No comments may appear inside the
# quoted string — every word is splatted onto the python command line.
megatron_options=" \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--clip-grad 1.0 \
--init-method-std 0.008 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--lr-decay-iters ${LR_DECAY_ITERS} \
--lr-warmup-iters ${LR_WARMUP_ITERS} \
--train-iters ${TRAIN_ITERS} \
--micro-batch-size ${BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--log-timers-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--no-load-optim \
--no-load-rng \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
--rotary-scaling-factor ${SCALE_FACTOR} \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
"
# PyTorch-profiler flags, appended only when --profiling=torch.
# NOTE(review): "PROFIE" is a typo for "PROFILE" (used consistently below, so
# behavior is unaffected; renaming would touch multiple lines).
TORCH_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_128nodes_tp4-pp8-ep64-etp2-cp1 \
--use-pytorch-profiler \
"
# HIP-profiler flags, appended only when --profiling=hip.
HIP_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 4 \
--profile-step-end 5 \
--use-hip-profiler \
"
# Per-rank torch.distributed rendezvous settings.
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
# Assemble the full training command line; each *_option(s) variable expands
# to zero or more flags (the string is intentionally word-split at exec time).
# NOTE(review): ${sft_options} (--train-mode ...) is computed above but never
# added here — confirm whether dropping it was intentional.
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
# Optionally wrap the command with a profiler: torch appends pytorch-profiler
# flags; hip additionally runs the whole thing under hipprof.
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
# Pin each local rank to its matching GPU and NUMA node: the HIP device id,
# CPU node and memory node all equal the local rank (0-7). A single glob arm
# replaces the eight identical per-rank arms; ranks outside 0-7 run nothing,
# exactly as before.
case ${LOCAL_RANK} in
[0-7])
export HIP_VISIBLE_DEVICES=${LOCAL_RANK}
# ${APP} is intentionally unquoted: it is a whitespace-separated command line.
numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} ${APP} ;;
esac
\ No newline at end of file
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
fi
done
# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/public/home/yuguo/check/rccl-tests-0204/topo-input.xml" #"your topo file"
export GLOG_minloglevel=3
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export LD_LIBRARY_PATH=/public/home/yuguo/data/rocblas-install-0224/lib:$LD_LIBRARY_PATH
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
#export MP_PP0_LAYERS=2 # 是否使能视实际情况而定
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256
LR=1e-5
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
TP=1
PP=2
PP=1
CP=1
ETP=1
EP=4
SP=true
DO=true
......@@ -56,36 +59,36 @@ SFT=false
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
VALID_DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
PRETRAIN_CHECKPOINT_PATH=${MEGATRON_PATH}/deepseekv3_dataset #"your model path"
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=100000000
WARMUP_TOKENS=10000
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
export NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1
attn_backend_option=" \
--attention-backend fused
--attention-backend auto
"
fi
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=2
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=2
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=${SEQ_LEN}
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
......@@ -94,32 +97,43 @@ if [ $MODEL_SIZE = A37B ]; then
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=8 #256
NUM_EXPERTS=8
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 1 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size 1 \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type aux_loss \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*0+[1]*2) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
"
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 1 \
--moe-router-num-groups 1 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*1+[1]*1) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controled by env
......@@ -147,6 +161,14 @@ comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
......@@ -154,9 +176,9 @@ if [ $AC = full ]; then
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
......@@ -165,8 +187,8 @@ elif [ $AC = none ]; then
"
elif [ $AC = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
......@@ -179,8 +201,8 @@ fi
if [ $PR = fp16 ]; then
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options=" \
......@@ -200,7 +222,7 @@ fi
if [ $DO = true ]; then
do_option=" \
--use-distributed-optimizer"
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_option=" \
......@@ -210,7 +232,7 @@ fi
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_option=" \
--sequence-parallel"
--sequence-parallel"
elif [ $SP = false ]; then
sp_option=" \
......@@ -236,7 +258,7 @@ fi
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--tokenizer-model $PRETRAIN_CHECKPOINT_PATH"
--load $PRETRAIN_CHECKPOINT_PATH"
fi
if [ $OPTIMIZER_OFFLOAD != false ]; then
......@@ -247,15 +269,21 @@ if [ $OPTIMIZER_OFFLOAD != false ]; then
fi
if [ $SFT = true ]; then
TRAIN_ITERS=${24}
LR_WARMUP_ITERS=${25}
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3"
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3"
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
if [ ${MP_DATASET_TYPE} = "raw" ]; then
......@@ -278,16 +306,18 @@ else
fi
##### Prepare logdirs #######
NAME="${PREFIX}"
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options=" \
--lr ${LR} \
......@@ -314,7 +344,7 @@ megatron_options=" \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 5 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
......@@ -328,13 +358,12 @@ megatron_options=" \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-bias-swiglu-fusion \
--no-rope-fusion \
--position-embedding-type rope \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
......@@ -342,12 +371,11 @@ megatron_options=" \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
--multi-latent-attention \
--mtp-num-layers 1 \
--use-mcore-models \
"
TORCH_PROFIE_ARGS=" \
......@@ -355,7 +383,7 @@ TORCH_PROFIE_ARGS=" \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_data_16nodes_dcu \
--profile-dir torch_prof_deepseekv3_1nodes_tp1-pp1-ep4-etp1-cp1 \
--use-pytorch-profiler \
"
......@@ -367,26 +395,30 @@ HIP_PROFIE_ARGS=" \
--use-hip-profiler \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${sft_options} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${comm_overlap_option} \
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${1}:25900 \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
if [[ $profiling == "torch" ]]; then
......@@ -397,37 +429,30 @@ elif [[ $profiling == "hip" ]]; then
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
esac
0)
export HIP_VISIBLE_DEVICES=0
numactl --cpunodebind=0 --membind=0 ${APP} ;;
1)
export HIP_VISIBLE_DEVICES=1
numactl --cpunodebind=1 --membind=1 ${APP} ;;
2)
export HIP_VISIBLE_DEVICES=2
numactl --cpunodebind=2 --membind=2 ${APP} ;;
3)
export HIP_VISIBLE_DEVICES=3
numactl --cpunodebind=3 --membind=3 ${APP} ;;
4)
export HIP_VISIBLE_DEVICES=4
numactl --cpunodebind=4 --membind=4 ${APP} ;;
5)
export HIP_VISIBLE_DEVICES=5
numactl --cpunodebind=5 --membind=5 ${APP} ;;
6)
export HIP_VISIBLE_DEVICES=6
numactl --cpunodebind=6 --membind=6 ${APP} ;;
7)
export HIP_VISIBLE_DEVICES=7
numactl --cpunodebind=7 --membind=7 ${APP} ;;
esac
\ No newline at end of file
#!/bin/bash
for para in $*
do
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=2 # 是否使能视实际情况而定
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=512
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
TP=2
PP=2
CP=1
ETP=1
EP=16
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###
### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
attn_backend_option=" \
--attention-backend auto
"
fi
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=3
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
QK_NOPE_HEAD_DIM=128
QK_ROPE_HEAD_DIM=64
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=256
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 4 \
--moe-router-num-groups 8 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*1+[1]*2) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controled by env
if [ -z ${MP_DATASET_TYPE} ];then
MP_DATASET_TYPE="idxmap"
fi
if [ -z ${MP_AC_LAYERS} ];then
MP_AC_LAYERS=1
fi
if [ -z ${MP_VP} ]; then
vp_option=""
else
vp_option=" \
--num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z ${MP_SFT_PACKING} ]; then
MP_SFT_PACKING=false
fi
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
echo "the num layers per pp rank must be a multiple of the recompute layers."
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
elif [ $AC = none ]; then
activation_checkpoint_options=" \
"
elif [ $AC = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
--tp-comm-overlap"
else
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option=""
fi
fi
# Numeric precision selected by $PR: fp16 | bf16 | fp8.
if [ $PR = fp16 ]; then
# fp16 needs QK layer scaling for numerical stability.
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options=" \
--bf16"
elif [ $PR = fp8 ]; then
# fp8 GEMMs (hybrid format) on top of bf16 master precision.
pr_options=" \
--bf16 \
--fp8-format hybrid \
--fp8-amax-compute-algo max \
--fp8-amax-history-len 1024"
fi
# Optimizer CPU offload requires the distributed optimizer; force-enable it.
if [ $OPTIMIZER_OFFLOAD != false ] && [ $DO = false ]; then
echo "Offload optimizer is valid only if \$DO=true"
DO=true
fi
if [ $DO = true ]; then
do_option=" \
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_option=" \
"
fi
# Sequence parallelism only makes sense together with tensor parallelism.
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_option=" \
--sequence-parallel"
elif [ $SP = false ]; then
sp_option=" \
"
fi
# Uneven pipeline split: MP_PP0_LAYERS layers on the first PP stage, the
# remainder spread evenly over the other PP-1 stages.
if [ -z ${MP_PP0_LAYERS} ];then
uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
_check=$(( ( $NUM_LAYERS - ${MP_PP0_LAYERS} ) % ( ${PP} - 1 ) ))
if [ $_check != 0 ]; then
echo "With uneven pipelineing the left over layers must be divisible by left over stages."
exit -1
fi
uneven_split_option=" \
--decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}
"
else
echo "uneven pipeline split must be used when PP > 1"
exit -1
fi
# Resume from a pretrained checkpoint unless the path is the literal "none".
# NOTE(review): load_option/offload_option stay unset when the condition is
# false; they are expanded unquoted later, so that is harmless.
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--load $PRETRAIN_CHECKPOINT_PATH"
fi
# OPTIMIZER_OFFLOAD is either "false" or the fraction of optimizer state to
# keep on the CPU.
if [ $OPTIMIZER_OFFLOAD != false ]; then
offload_option=" \
--optimizer-cpu-offload \
--use-precision-aware-optimizer \
--optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi
# Finetune vs pretrain schedule. ${25} and ${26} are positional parameters of
# the enclosing script (iteration counts passed by the caller), not typos.
if [ $SFT = true ]; then
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# Pretrain: derive iteration counts from token budgets.
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
# Dataset flags: "raw" = JSON-SFT files with explicit train/valid paths,
# otherwise indexed mmap dataset with a 99/1/0 split.
if [ ${MP_DATASET_TYPE} = "raw" ]; then
dataset_options=" \
--train-data-path ${DATASET_PATH} \
--valid-data-path ${VALID_DATASET_PATH} \
--dataloader-type cyclic \
--dataset JSON-SFT"
else
dataset_options=" \
--data-path ${DATASET_PATH} \
--split 99,1,0"
fi
# Sequence packing is intentionally a no-op for this model (MLA lacks THD
# attention support); both branches leave packing_options empty.
if [ ${MP_SFT_PACKING} = true ]; then
echo "Currently MLA do not support THD format attention, thus sequence packing can not be used..."
packing_options=""
else
packing_options=""
fi
##### Prepare logdirs #######
# Run name encodes the key hyper-parameters so every configuration gets its
# own tensorboard/checkpoint directory.
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
# Quote the paths: NAME embeds caller-supplied values that may contain spaces
# (the two mkdirs below were unquoted, unlike the three above).
mkdir -p "${TENSORBOARD_DIR}"
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p "${SAVED_PRETRAIN_CHECKPOINT_PATH}"
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
# Core Megatron-LM arguments: optimizer/schedule, model shape, logging,
# parallel sizes, tokenizer, normalization/rotary settings, checkpoint format.
# (Comments cannot go inside the quoted string — it is word-split later.)
megatron_options="  \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--clip-grad 1.0 \
--init-method-std 0.008 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--lr-decay-iters ${LR_DECAY_ITERS} \
--lr-warmup-iters ${LR_WARMUP_ITERS} \
--train-iters ${TRAIN_ITERS} \
--micro-batch-size ${BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--log-timers-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--no-load-optim \
--no-load-rng \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
--rotary-scaling-factor ${SCALE_FACTOR} \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
"
# PyTorch-profiler flags, appended to APP when --profiling=torch.
TORCH_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--use-pytorch-profiler \
"
# hipprof flags, appended to APP when --profiling=hip.
HIP_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 4 \
--profile-step-end 5 \
--use-hip-profiler \
"
# Distributed init without torchrun: explicit rank/world-size/local-rank plus
# a TCP rendezvous address.
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
# Assemble the final command line by concatenating all option groups; the
# string is expanded unquoted later so it word-splits into individual flags.
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
# Optional profiling: torch appends the PyTorch-profiler flags; hip appends
# the hip-profiler flags and wraps the command in hipprof.
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
# Hygon CPU: pin each local rank to its own GPU and the matching NUMA node.
# All eight original branches were identical up to the rank number, so a
# single [0-7] arm reuses ${LOCAL_RANK} for both the device id and the
# cpunodebind/membind targets.
case ${LOCAL_RANK} in
[0-7])
export HIP_VISIBLE_DEVICES=${LOCAL_RANK}
numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} ${APP}
;;
esac
\ No newline at end of file
# GPT3 MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a docker container run it as follows
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH="" #<Specify path>
VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \
bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH "
```
NOTE: Depending on the environment you are running it in, the above command might look slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run the 175B model. There are other configs you could run as well
### 345M
```
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--seq-length 1024 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
# Parse optional launcher flags; "$@" preserves each argument as a single
# word (the original $* re-split arguments on whitespace).
profiling=""
for para in "$@"; do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# These variables need to be filled in before launching.
GPUS=""                  # total number of GPUs to use (8 per node)
DTK_ENV=""               # env.sh of dtk
NCCL_ENV=""              # env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST=""                  # hostname of the rendezvous node
PORT=""                  # rendezvous port id
DATA_PATH=""             # path to redpajama_text_document
TOKENIZER_MODEL_PATH=""  # path to tokenizer.model
CHECKPOINT_PATH=""       # path to ckpt

# Runs the GPT 567B model across GPUS/8 nodes; bare GPUS in arithmetic avoids
# a syntax error when it is left empty.
NODES=$(( GPUS / 8 ))
mpirun -np "${GPUS}" --hostfile hostfile_gpt_567B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "
    source ${DTK_ENV} && \
    source ${NCCL_ENV} && \
    ./train_gpt_567B_${NODES}nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling" > "log-${NODES}nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
# Parse optional flags; "$@" keeps arguments intact (the original $* split
# them on whitespace).
profiling=""
for para in "$@"; do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# Single-node smoke run: 8 ranks on localhost.
# NOTE(review): the script name has no ./ prefix, so it must be on PATH —
# likely intended as ./train_gpt_567B_1nodes.sh; confirm against repo layout.
mpirun -np 8 --allow-run-as-root \
    train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait

# Clean up artifacts produced by the run.
rm -rf CKPT
rm -rf gpt_dataset/redpajama_text_document
# Parse optional flags; "$@" keeps arguments intact (the original $* split
# them on whitespace).
profiling=""
for para in "$@"; do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# 64-node run (512 ranks) rooted at node059.
# NOTE(review): the script name has no ./ prefix, so it must be on PATH —
# likely intended as ./train_gpt_567B_multinodes.sh; confirm.
mpirun -np 512 --hostfile hostfile_gpt_567B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    train_gpt_567B_multinodes.sh node059 --profiling=$profiling > output.log 2>&1
wait

# Clean up artifacts produced by the run.
rm -rf CKPT
rm -rf gpt_dataset/redpajama_text_document
\ No newline at end of file
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -23,25 +31,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -83,51 +77,33 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 98,2,0
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 1024
--global-batch-size 2048
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_64nodes_tp4-pp8-ep16-ep_tp4-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
--pipeline-model-parallel-size 16
--expert-model-parallel-size 16
--expert-tensor-parallel-size 4
--context-parallel-size 2
#--num-layers-per-virtual-pipeline-stage 2
--use-distributed-optimizer
--sequence-parallel
)
......@@ -146,10 +122,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_128nodes_tp4-pp16-ep16-etp4-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-project ${WANDB_PROJECT:-"GPT"}
--wandb-exp-name ${WANDB_NAME:-"GPT_567B"}
)
fi
......@@ -173,44 +166,28 @@ fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
0)
export HIP_VISIBLE_DEVICES=0
numactl --cpunodebind=0 --membind=0 ${APP} ;;
1)
export HIP_VISIBLE_DEVICES=1
numactl --cpunodebind=1 --membind=1 ${APP} ;;
2)
export HIP_VISIBLE_DEVICES=2
numactl --cpunodebind=2 --membind=2 ${APP} ;;
3)
export HIP_VISIBLE_DEVICES=3
numactl --cpunodebind=3 --membind=3 ${APP} ;;
4)
export HIP_VISIBLE_DEVICES=4
numactl --cpunodebind=4 --membind=4 ${APP} ;;
5)
export HIP_VISIBLE_DEVICES=5
numactl --cpunodebind=5 --membind=5 ${APP} ;;
6)
export HIP_VISIBLE_DEVICES=6
numactl --cpunodebind=6 --membind=6 ${APP} ;;
7)
export HIP_VISIBLE_DEVICES=7
numactl --cpunodebind=7 --membind=7 ${APP} ;;
esac
\ No newline at end of file
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -23,25 +31,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -83,8 +77,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 98,2,0
)
......@@ -93,39 +87,23 @@ TRAINING_ARGS=(
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep4-ep_tp2-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
--expert-model-parallel-size 4
--expert-tensor-parallel-size 2
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
......@@ -144,10 +122,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep4-etp2-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-project ${WANDB_PROJECT:-"GPT"}
--wandb-exp-name ${WANDB_NAME:-"GPT_567B"}
)
fi
......@@ -171,44 +166,28 @@ fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
0)
export HIP_VISIBLE_DEVICES=0
numactl --cpunodebind=0 --membind=0 ${APP} ;;
1)
export HIP_VISIBLE_DEVICES=1
numactl --cpunodebind=1 --membind=1 ${APP} ;;
2)
export HIP_VISIBLE_DEVICES=2
numactl --cpunodebind=2 --membind=2 ${APP} ;;
3)
export HIP_VISIBLE_DEVICES=3
numactl --cpunodebind=3 --membind=3 ${APP} ;;
4)
export HIP_VISIBLE_DEVICES=4
numactl --cpunodebind=4 --membind=4 ${APP} ;;
5)
export HIP_VISIBLE_DEVICES=5
numactl --cpunodebind=5 --membind=5 ${APP} ;;
6)
export HIP_VISIBLE_DEVICES=6
numactl --cpunodebind=6 --membind=6 ${APP} ;;
7)
export HIP_VISIBLE_DEVICES=7
numactl --cpunodebind=7 --membind=7 ${APP} ;;
esac
\ No newline at end of file
#!/bin/bash
# set -eux
# Parse optional flags; "$@" preserves argument boundaries (the original $*
# re-split every argument on whitespace).
profiling=""
for para in "$@"; do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done
# Resolve the Megatron checkout two directories above this script.
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
#default env
#export FLASH_ATTENTION_PRINT_PARAM=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
# export GPU_MAX_HW_QUEUES=10
#export HIP_ALLOC_INITIALIZE=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
# nccl env
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only log error-level NCCL/glog messages
source /opt/dtk/env.sh
# hipblaslt library
export LD_LIBRARY_PATH=/data/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# rocblas
export LD_LIBRARY_PATH=/data/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
# torch: fold multi-stream communication onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# enlarge the torch compile cache
export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/data/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
# Llama2-7B architecture.
GPT_MODEL_ARGS=(
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--max-position-embeddings 4096
--normalization RMSNorm # LightopRMSNorm
--position-embedding-type rope # none #
--untie-embeddings-and-output-weights # separate embedding and output weights for flexibility
)
export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention path
# --transformer-impl transformer_engine # use this pair of options for the mcore path
# --use-mcore-models
# --transformer-impl local # use this pair of options for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl local # use this pair of options for the legacy path
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256 #256 #240 #60 #512 #64
--train-iters 50
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # fp16 requires an explicit loss scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # average grads directly in the DP collective instead of sum-then-divide
# --recompute-granularity full # recompute lowers memory at the cost of time
# --recompute-num-layers 5 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap DDP grad reduce with compute
# --tp-comm-overlap # overlap TP comm with GEMM; optimization not yet adapted
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
--use-flash-attn
)
# Env vars for the torch flash-attention path (disabled).
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
# --context-parallel-size 2
# --num-layers-per-virtual-pipeline-stage 4
# --microbatch-group-size-per-virtual-pipeline-stage 1
# --no-overlap-p2p-communication # disable p2p comm overlap when enabled
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-throughput
--eval-iters 50
--log-interval 1
--save-interval 1000
--eval-interval 1000
--save $SAVE_PATH
--load $SAVE_PATH
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
# PyTorch-profiler flags (opt-in; see commented PROFILE_ARGS below APP).
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7
--profile-dir prof_data
)
# Ranks and rendezvous come from Open MPI's environment.
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
# Final command; arrays expand into individual flags.
APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# To enable profiling, append ${PROFILE_ARGS[@]} to APP above.
# export HIP_VISIBLE_DEVICES=0,7 # subset for debugging
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# export CUDA_VISIBLE_DEVICES=4,5,6,7
# Bind each local rank to its NUMA node. Every original branch exported the
# full GPU list and differed only in the NUMA id, so one [0-7] arm suffices.
# (hipprof wrapper variants kept for reference:
#  hipprof --hip-trace --trace-off numactl ... ${APP})
case ${LOCAL_RANK} in
[0-7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} ${APP}
;;
esac
\ No newline at end of file
# Parse optional launcher flags; "$@" preserves each argument as one word
# (the original $* re-split arguments on whitespace).
profiling=""
for para in "$@"; do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# These variables need to be filled in before launching.
GPUS=""                  # total number of GPUs to use (8 per node)
DTK_ENV=""               # env.sh of dtk
NCCL_ENV=""              # env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST=""                  # hostname of the rendezvous node
PORT=""                  # rendezvous port id
DATA_PATH=""             # path to oscar-1GB_head-llama2_text_document
TOKENIZER_MODEL_PATH=""  # path to tokenizer.model
CHECKPOINT_PATH=""       # path to ckpt

# Runs the Llama2 7B model across GPUS/8 nodes; bare GPUS in arithmetic
# avoids a syntax error when it is left empty.
NODES=$(( GPUS / 8 ))
mpirun -np "${GPUS}" --hostfile hostfile_llama2_7B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "
    source ${DTK_ENV} && \
    source ${NCCL_ENV} && \
    ./train_llama2_7b_${NODES}nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling" > "log-${NODES}nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
#!/bin/bash
# Default dataloader settings; overridden by --reproduce below.
INITIALIZATION_ARGS=( --num-workers 2)
# Parse launcher flags; "$@" preserves argument boundaries (the original $*
# re-split every argument on whitespace).
for para in "$@"; do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        # Deterministic mode: single-process dataloading plus deterministic
        # library behavior.
        INITIALIZATION_ARGS=( --reproduce --num-workers 0)
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1 # force deterministic MIOpen algorithms
        export ROCBLAS_ATOMICS_MOD=0                    # disable rocBLAS atomic operations
        # Disable the atomic-using convolution algorithms in MIOpen and keep
        # only GEMM.
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env — rank info and rendezvous come from mpirun / positional args.
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
# Megatron checkout is two directories above this script.
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# torch: fold multi-stream communication onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# enlarge the torch compile cache
export cache_size_limit=64
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
# Llama2-7B architecture.
GPT_MODEL_ARGS=(
--seq-length 4096
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--max-position-embeddings 4096
--normalization LightopRMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights
)
# Optimizer/schedule and runtime features (legacy model path, bf16, flash attn).
TRAINING_ARGS=(
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256
--train-iters 50
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective
--overlap-grad-reduce
--use-flash-attn
)
# TP1/PP2 with distributed optimizer and sequence parallelism.
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-throughput
--eval-iters 5
--log-interval 1
--save-interval 1000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)
# PyTorch-profiler flags (appended when --profiling=torch).
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
--use-pytorch-profiler
)
# hipprof flags (appended when --profiling=hip).
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
# Final command; arrays expand into individual flags when APP is word-split.
APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
${INITIALIZATION_ARGS[@]} \
"
# Optional profiling: torch appends the PyTorch-profiler flags; hip appends
# the hip-profiler flags and wraps the command in hipprof.
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
# Hygon CPU: pin each local rank to its own GPU and the matching NUMA node.
# The eight original branches were identical up to the rank number, so a
# single [0-7] arm reuses ${LOCAL_RANK} for the device id and NUMA binding.
case ${LOCAL_RANK} in
[0-7])
export HIP_VISIBLE_DEVICES=${LOCAL_RANK}
numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} ${APP}
;;
esac
\ No newline at end of file
# Mixtral 8x7B Model Inference and Finetuning
## Download Mixtral 8x7B Checkpoints
Download Mixtral 8x7B HF format checkpoint from [HF-hub](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/)
Or you can simply run this following script to download Mixtral 8x7B into a specific folder.
```python
from huggingface_hub import snapshot_download
SAVED_DIR = "" # Specify the saved directory
# Download HF checkpoints
snapshot_download(repo_id="mistralai/Mixtral-8x7B-v0.1", ignore_patterns=["*.pt"], local_dir=SAVED_DIR, local_dir_use_symlinks=False)
```
## Convert Mixtral 8x7B checkpoints from HF to MCore
The HF checkpoints can be converted to Megatron format by using the provided checkpoint converter for HF format.
The target model parallel size(e.g. TP,PP,EP) should be specified.
Currently the converter doesn't support distributed checkpointing yet, so each different parallel config requires a specific checkpoint.
- For training, the recommended model parallel config is TP1EP8PP4
- For inference, the recommended model parallel config is TP1EP1PP2
```
TOKENIZER_MODEL=/workspace/checkpoints/mixtral-hf/tokenizer.model
MEGATRON_PATH="/workspace/megatron-lm"
export PYTHONPATH=$MEGATRON_PATH:$PYTHONPATH
export CUDA_DEVICE_MAX_CONNECTIONS=1
TARGET_TP_SIZE=""
TARGET_EP_SIZE=""
TARGET_PP_SIZE=""
HF_FORMAT_DIR=/workspace/checkpoints/mixtral-hf
MEGATRON_FORMAT_DIR=/workspace/checkpoints/mixtral-mcore-TP${TARGET_TP_SIZE}PP${TARGET_PP_SIZE}EP${TARGET_EP_SIZE}
python tools/checkpoint/convert.py \
--model-type GPT \
--loader loader_mixtral_hf \
--saver mcore \
--target-tensor-parallel-size ${TARGET_TP_SIZE} \
--target-pipeline-parallel-size ${TARGET_PP_SIZE} \
--target-expert-parallel-size ${TARGET_EP_SIZE} \
--load-dir ${HF_FORMAT_DIR} \
--save-dir ${MEGATRON_FORMAT_DIR} \
--tokenizer-model ${TOKENIZER_MODEL}
```
## Text generation with Mixtral 8x7B
Inference with Mixtral 8x7B requires at least 2 GPUs, so a distributed checkpoint with EP>=2 or PP>=2 (converted with the script above) is needed.
Megatron-LM includes a simple REST server for text generation in `tools/run_text_generation_server.py`; launch it with the following script:
```
#!/bin/bash
# This example will start serving the Mixtral 8x7B model.
DISTRIBUTED_ARGS="--nproc_per_node 2 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint>
TOKENIZER_MODEL=<Path to tokenizer (e.g. /tokenizer.model)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 2 \
--expert-model-parallel-size 1 \
--load ${CHECKPOINT} \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model $TOKENIZER_MODEL \
--use-mcore-models \
--max-position-embeddings 32768 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--normalization RMSNorm \
--disable-bias-linear \
--position-embedding-type rope \
--no-position-embedding \
--swiglu \
--untie-embeddings-and-output-weights \
--group-query-attention \
--num-query-groups 8 \
--bf16 \
--micro-batch-size 1 \
--seq-length 1024 \
--seed 42 \
--num-experts 8 \
--moe-router-topk 2 \
--moe-token-dispatcher-type alltoall \
--moe-grouped-gemm \
--mock-data \
--rotary-base 1000000
```
Once the server is running, you can use `tools/text_generation_cli.py` to query it; it takes a single argument: the host the server is running on.
```
python tools/text_generation_cli.py localhost:5000
```
## Finetuning from pretrained Mixtral 8x7B
To finetune the pretrained Mixtral 8x7B model, use the following scripts:
```bash
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.04-py3
CHECKPOINT_PATH="" # Specify path to checkpoint dir
TOKENIZER_MODEL="" # Specify path to tokenizer.model
DATA_PATH="" # Specify path to data
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/mixtral/train_mixtral_8x7b_distributed.sh $CHECKPOINT_PATH $TOKENIZER_MODEL $DATA_PATH
```
The above functionality also applies to Mixtral 8x22B; set the model config (including hidden_size/head_num/num_layers/ffn_hidden_size) according to the original [config](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1/blob/main/config.json).
## Acknowledgements
Contributors outside NVIDIA for the huggingface converter and example of Mixtral models in Megatron-Core:
- Peng Li <jerry.lp@alibaba-inc.com>
- Jun Huang <huangjun.hj@alibaba-inc.com>
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment