Commit 250c7fb0 authored by wxj

update model parameters format

parent c788823b
@@ -7,8 +7,7 @@ done
# Those variables need to modify
GPUS="" # how many gpus to use
DTK_ENV="" # where env.sh of dtk
NCCL_ENV="" # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
MPI_PORT="" # mpi port to use
HOST="" # hostname
PORT="" # port id
DATA_PATH="" # path to mmap_deepseekv3_datasets_text_document
@@ -20,9 +19,8 @@ mpirun -np ${GPUS} --hostfile hostfile_deepseekv3_671B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
--mca plm_rsh_args "-p ${MPI_PORT}" \
bash -c "
source ${DTK_ENV} && \
source ${NCCL_ENV} && \
./train_deepseekv3_671B_$((${GPUS} / 8))nodes.sh \
${HOST} \
${PORT} \
......
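For reference, a minimal sketch of how the trimmed-down launcher variables might be filled in after this change; every value below is a hypothetical placeholder, not part of the commit:

GPUS="16"                                # total GPUs across nodes (two 8-GPU nodes here)
DTK_ENV="/opt/dtk/env.sh"                # hypothetical path to the dtk env.sh
HOST="node01"                            # hypothetical rendezvous hostname
PORT="29500"                             # hypothetical rendezvous port
DATA_PATH="/data/mmap_deepseekv3_datasets_text_document"   # hypothetical dataset path
# With GPUS=16 the launcher resolves $((${GPUS} / 8)) to 2 and calls
# train_deepseekv3_671B_2nodes.sh, so GPUS should stay a multiple of 8.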
@@ -29,7 +29,7 @@ export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
#export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=5 # whether to enable depends on the actual setup
### BASE CONFIG ###
......
@@ -29,7 +29,7 @@ export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
#export GROUPED_GEMM_BatchLinear=1
#export MP_PP0_LAYERS=2 # whether to enable depends on the actual setup
### BASE CONFIG ###
......
@@ -29,7 +29,7 @@ export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
#export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=2 # whether to enable depends on the actual setup
### BASE CONFIG ###
......
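If the BatchLinear path has to be switched back on for a one-off run of these per-node scripts, a minimal sketch is to pass the variable through the environment instead of uncommenting the export; this assumes the training code reads GROUPED_GEMM_BatchLinear from the environment, and the node count and arguments below are placeholders:

# Hypothetical one-off override; the real script may take more arguments than shown here.
GROUPED_GEMM_BatchLinear=1 ./train_deepseekv3_671B_2nodes.sh "${HOST}" "${PORT}"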
@@ -7,8 +7,7 @@ done
# Those variables need to modify
GPUS="" # how many gpus to use
DTK_ENV="" # where env.sh of dtk
NCCL_ENV="" # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
MPI_PORT="" # mpi port to use
HOST="" # hostname
PORT="" # port id
DATA_PATH="" # path to redpajama_text_document
@@ -20,9 +19,8 @@ mpirun -np ${GPUS} --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
--mca plm_rsh_args "-p ${MPI_PORT}" \
bash -c "
source ${DTK_ENV} && \
source ${NCCL_ENV} && \
./train_gpt_567B_$((${GPUS} / 8))nodes.sh \
${HOST} \
${PORT} \
......
@@ -34,7 +34,7 @@ export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
#export GROUPED_GEMM_BatchLinear=1
DISTRIBUTED_ARGS=(
--rank ${RANK}
......
@@ -34,7 +34,7 @@ export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
#export GROUPED_GEMM_BatchLinear=1
DISTRIBUTED_ARGS=(
--rank ${RANK}
......
@@ -7,8 +7,7 @@ done
# Those variables need to modify
GPUS="" # how many gpus to use
DTK_ENV="" # where env.sh of dtk
NCCL_ENV="" # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
MPI_PORT="" # mpi port to use
HOST="" # hostname
PORT="" # port id
DATA_PATH="" # path to my-mixtral_text_document
@@ -20,9 +19,8 @@ mpirun -np ${GPUS} --hostfile hostfile_mixtral_8x22B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
--mca plm_rsh_args "-p ${MPI_PORT}" \
bash -c "
source ${DTK_ENV} && \
source ${NCCL_ENV} && \
./train_mixtral_8x22B_$((${GPUS} / 8))nodes.sh \
${HOST} \
${PORT} \
......
@@ -7,8 +7,7 @@ done
# Those variables need to modify
GPUS="" # how many gpus to use
DTK_ENV="" # where env.sh of dtk
NCCL_ENV="" # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
MPI_PORT="" # mpi port to use
HOST="" # hostname
PORT="" # port id
DATA_PATH="" # path to my-mixtral_text_document
@@ -20,9 +19,8 @@ mpirun -np ${GPUS} --hostfile hostfile_mixtral_8x7B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
--mca plm_rsh_args "-p ${MPI_PORT}" \
bash -c "
source ${DTK_ENV} && \
source ${NCCL_ENV} && \
./train_mixtral_8x7B_$((${GPUS} / 8))nodes.sh \
${HOST} \
${PORT} \
......
@@ -34,7 +34,7 @@ export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
#export GROUPED_GEMM_BatchLinear=1
DISTRIBUTED_ARGS=(
--rank ${RANK}
......
@@ -34,7 +34,7 @@ export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
#export GROUPED_GEMM_BatchLinear=1
DISTRIBUTED_ARGS=(
--rank ${RANK}
......
@@ -34,7 +34,7 @@ export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
#export GROUPED_GEMM_BatchLinear=1
DISTRIBUTED_ARGS=(
--rank ${RANK}
......
@@ -34,7 +34,7 @@ export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
#export GROUPED_GEMM_BatchLinear=1
DISTRIBUTED_ARGS=(
--rank ${RANK}
......
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
"${@:2}"
# LOCAL_RANK=$1
# shift
LOCAL_RANK=$1
shift
numa_map=(0 1 2 3 4 5 6 7)
NUMA_ID=${numa_map[$LOCAL_RANK]}
numactl --cpunodebind=${NUMA_ID} --membind=${NUMA_ID} "$@"
# numa_map=(0 1 2 3 4 5 6 7)
# NUMA_ID=${numa_map[$LOCAL_RANK]}
# numactl --cpunodebind=${NUMA_ID} --membind=${NUMA_ID} "$@"