Commit 70368616 authored by silencealiang

update model parameters

parent 8551c38e
@@ -2,14 +2,13 @@ for para in $*
 do
     if [[ $para == --profiling* ]];then
         profiling=${para#*=}
-        export GPU_FLUSH_ON_EXECUTION=1
-        export HIP_DIRECT_DISPATCH=0
     fi
 done
 mpirun -np 8 --allow-run-as-root \
-    train_deepseek_v3_1node.sh localhost --profiling=$profiling > output.log 2>&1
+    train_deepseekv3_671B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
 wait
-rm -rf CKPT
+rm -rf output
+rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
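Every launch wrapper in this commit accepts the same optional `--profiling=` flag, which is forwarded to the per-node training script and matched there against `torch` or `hip`. A minimal invocation sketch (the wrapper filename below is illustrative; it is not shown in this excerpt):

```
# Hypothetical wrapper name; substitute the actual run script from the repo.
bash run_deepseekv3_671B_1nodes.sh --profiling=torch   # PyTorch profiler on local ranks 0-7
bash run_deepseekv3_671B_1nodes.sh --profiling=hip     # hipprof trace
bash run_deepseekv3_671B_1nodes.sh                     # no profiling
```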
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
mpirun -np 32 --hostfile hostfile_deepseekv3_671B_4nodes \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_4nodes.sh node002 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
wait
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
mpirun -np 1024 --hostfile hostfile_deepseekv3_671B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_multinodes.sh node001 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
wait
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
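The `--hostfile` arguments above (`hostfile_deepseekv3_671B_4nodes`, `hostfile_deepseekv3_671B`) are not part of this commit. A minimal sketch of the expected Open MPI hostfile format, assuming one rank per GPU (8 slots per node); the node names are placeholders:

```
# hostfile_deepseekv3_671B_4nodes (illustrative): 4 nodes x 8 slots = 32 ranks
node001 slots=8
node002 slots=8
node003 slots=8
node004 slots=8
```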
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
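The topology file above is what the training scripts point `NCCL_TOPO_FILE` at. A quick, rough way to cross-check that the HCAs listed in `NCCL_IB_HCA` actually appear in the file, and to see how the eight GPUs are enumerated, is plain text matching (a sketch; adjust the path as needed):

```
# RDMA NICs declared in the topology file (NCCL_IB_HCA uses mlx5_2 ... mlx5_9)
grep -o 'name="mlx5_[0-9]*"' topo-input.xml | sort -u

# GPU entries with their device index and rank (expect 8 on this node)
grep -o 'gpu dev="[0-9]*"[^>]*rank="[0-9]*"' topo-input.xml
```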
 #!/bin/bash
 for para in $*
 do
     if [[ $para == --profiling* ]];then
         profiling=${para#*=}
-        # export GPU_FLUSH_ON_EXECUTION=1
-        # export HIP_DIRECT_DISPATCH=0
     fi
 done
+# Runs DeepseekV3 671B model
+source /opt/dtk/env.sh
+# default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
+export GLOG_minloglevel=3
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export HSA_FORCE_FINE_GRAIN_PCIE=1
 export OMP_NUM_THREADS=1
 export GPU_MAX_HW_QUEUES=10
+# nccl env
 export NCCL_ALGO=Ring
 export NCCL_MIN_NCHANNELS=32
 export NCCL_MAX_NCHANNELS=32
@@ -22,22 +32,20 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-export NCCL_TOPO_FILE="/public/home/yuguo/check/rccl-tests-0204/topo-input.xml" #"your topo file"
-export GLOG_minloglevel=3
-export GROUPED_GEMM_BatchLinear=1
-export LD_LIBRARY_PATH=/public/home/yuguo/data/rocblas-install-0224/lib:$LD_LIBRARY_PATH
-LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-RANK=$OMPI_COMM_WORLD_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+export NCCL_TOPO_FILE="./topo-input.xml"
+# enable BatchLinear
+export GROUPED_GEMM_BatchLinear=1
+#export MP_PP0_LAYERS=2 # enable depending on the actual setup
 ### BASE CONFIG ###
 MODEL_SIZE=A37B
 BATCH_SIZE=1
 GLOBAL_BATCH_SIZE=256
-LR=1e-5
+LR=1e-4
 MIN_LR=1e-6
 SEQ_LEN=4096
+PAD_LEN=4096
 PR=bf16
 ### BASE CONFIG ###
@@ -45,6 +53,7 @@ PR=bf16
 TP=1
 PP=2
 CP=1
+ETP=1
 EP=4
 SP=true
 DO=true
@@ -56,13 +65,14 @@ SFT=false
 AC=none
 OPTIMIZER_OFFLOAD=false
 SAVE_INTERVAL=500
-DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
-VALID_DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
-PRETRAIN_CHECKPOINT_PATH=${MEGATRON_PATH}/deepseekv3_dataset #"your model path"
+DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
+VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
+PRETRAIN_CHECKPOINT_PATH="./output"
+TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
 # the following two values will not be used when SFT is true
-TRAIN_TOKENS=100000000
-WARMUP_TOKENS=10000
+TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
+WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
 ###############################
 OUTPUT_BASEPATH=./output
@@ -72,20 +82,19 @@ if [ $FL = true ]; then
     :
     #exit -1
 elif [ $FL = false ]; then
-    export NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1
     attn_backend_option=" \
-        --attention-backend fused
+        --attention-backend auto
         "
 fi
 if [ $MODEL_SIZE = A37B ]; then
-    TRAIN_ITERS=2
+    TRAIN_ITERS=10
     HIDDEN_SIZE=7168
     NUM_ATTENTION_HEADS=128
     NUM_LAYERS=2
     INTERMEDIATE_SIZE=18432
     MOE_INTERMEDIATE_SIZE=2048
-    MAX_POSITION_EMBEDDINGS=${SEQ_LEN}
+    MAX_POSITION_EMBEDDINGS=163840
     EXTRA_VOCAB_SIZE=467
     Q_LORA_RANK=1536
     KV_LORA_RANK=512
@@ -94,32 +103,43 @@ if [ $MODEL_SIZE = A37B ]; then
     V_HEAD_DIM=128
     ROPE_THETA=10000
     SCALE_FACTOR=40
-    NUM_EXPERTS=8 #256
+    NUM_EXPERTS=8
     ROUTER_TOPK=8
     NUM_SHARED_EXPERTS=1
     RMS_NORM_EPS=1e-6
     moe_options=" \
         --moe-grouped-gemm \
-        --moe-expert-capacity-factor 1 \
+        --moe-expert-capacity-factor 0.5 \
         --moe-pad-expert-input-to-capacity \
         --moe-token-dispatcher-type alltoall \
         --moe-router-topk ${ROUTER_TOPK} \
-        --num-experts ${NUM_EXPERTS} \
-        --expert-model-parallel-size ${EP} \
-        --expert-tensor-parallel-size 1 \
-        --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
-        --moe-router-load-balancing-type aux_loss \
-        --moe-aux-loss-coeff 0.001 \
-        --moe-layer-freq ([0]*0+[1]*2) \
-        --q-lora-rank ${Q_LORA_RANK} \
-        --kv-lora-rank ${KV_LORA_RANK} \
-        --qk-head-dim ${QK_NOPE_HEAD_DIM} \
-        --qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
-        --v-head-dim ${V_HEAD_DIM} \
-        --moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
-        "
+        --moe-router-group-topk 2 \
+        --moe-router-num-groups 4 \
+        --num-experts ${NUM_EXPERTS} \
+        --expert-model-parallel-size ${EP} \
+        --expert-tensor-parallel-size ${ETP} \
+        --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
+        --moe-router-load-balancing-type seq_aux_loss \
+        --moe-router-topk-scaling-factor 2.5 \
+        --moe-shared-expert-overlap \
+        --moe-router-enable-expert-bias \
+        --mscale 1.0 \
+        --mscale-all-dim 1.0 \
+        --moe-router-score-function sigmoid \
+        --moe-router-bias-update-rate 0.001 \
+        --moe-aux-loss-coeff 0.001 \
+        --moe-layer-freq ([0]*1+[1]*1) \
+        --moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
+        --q-lora-rank ${Q_LORA_RANK} \
+        --kv-lora-rank ${KV_LORA_RANK} \
+        --qk-head-dim ${QK_NOPE_HEAD_DIM} \
+        --qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
+        --v-head-dim ${V_HEAD_DIM} \
+        --mtp-num-layers 1 \
+        "
+    mtp_options=""
 fi
 # Here are some configs controlled by env
@@ -147,6 +167,14 @@ comm_overlap_option="\
     --overlap-grad-reduce \
     --overlap-param-gather"
+# if [ $TP_COMM_OVERLAP -eq 1 ]; then
+# comm_overlap_option="\
+#     --tp-comm-overlap \
+#     --overlap-grad-reduce \
+#     --overlap-param-gather"
+# fi
 if [ $AC = full ]; then
     _check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
     if [ $_check != 0 ]; then
@@ -154,9 +182,9 @@ if [ $AC = full ]; then
         exit -1
     fi
     activation_checkpoint_options=" \
         --recompute-method uniform \
         --recompute-num-layers ${MP_AC_LAYERS} \
         --recompute-granularity full"
 elif [ $AC = sel ]; then
     activation_checkpoint_options=" \
         --recompute-activations"
@@ -165,8 +193,8 @@ elif [ $AC = none ]; then
     "
 elif [ $AC = offload ]; then
     activation_checkpoint_options=" \
         --cpu-offloading \
         --cpu-offloading-num-layers ${MP_AC_LAYERS}"
     if [ $TP_COMM_OVERLAP -eq 1 ]; then
         echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
         comm_overlap_option="\
@@ -179,8 +207,8 @@ fi
 if [ $PR = fp16 ]; then
     pr_options=" \
         --fp16 \
         --apply-query-key-layer-scaling"
     export NVTE_APPLY_QK_LAYER_SCALING=1
 elif [ $PR = bf16 ]; then
     pr_options=" \
@@ -200,7 +228,7 @@ fi
 if [ $DO = true ]; then
     do_option=" \
         --use-distributed-optimizer"
 elif [ $DO = false ]; then
     do_option=" \
@@ -210,7 +238,7 @@ fi
 if [ $SP = true ] && [ $TP -gt 1 ]; then
     sp_option=" \
         --sequence-parallel"
 elif [ $SP = false ]; then
     sp_option=" \
@@ -236,7 +264,7 @@ fi
 if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
     load_option=" \
-        --tokenizer-model $PRETRAIN_CHECKPOINT_PATH"
+        --load $PRETRAIN_CHECKPOINT_PATH"
 fi
 if [ $OPTIMIZER_OFFLOAD != false ]; then
@@ -247,15 +275,21 @@ if [ $OPTIMIZER_OFFLOAD != false ]; then
 fi
 if [ $SFT = true ]; then
-    TRAIN_ITERS=${24}
-    LR_WARMUP_ITERS=${25}
+    TRAIN_ITERS=${25}
+    LR_WARMUP_ITERS=${26}
     LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
-    PREFIX="finetune-mcore-deepseek-v3"
+    PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
+    sft_options=" \
+        --eod-mask-loss \
+        --calculate-per-token-loss \
+        --train-mode finetune"
 else
     # TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
     LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
     LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
-    PREFIX="pretrain-mcore-deepseek-v3"
+    PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
+    sft_options=" \
+        --train-mode pretrain"
 fi
 if [ ${MP_DATASET_TYPE} = "raw" ]; then
@@ -278,16 +312,18 @@ else
 fi
 ##### Prepare logdirs #######
-NAME="${PREFIX}"
+NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
 mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
 mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
 mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}"
+current_time=$(date "+%Y.%m.%d-%H.%M.%S")
+TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
 mkdir -p ${TENSORBOARD_DIR}
 SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
 mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
-find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
+#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
+#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
 megatron_options=" \
     --lr ${LR} \
@@ -314,7 +350,7 @@ megatron_options=" \
     --log-interval 1 \
     --log-throughput \
     --eval-interval 10000 \
-    --eval-iters 5 \
+    --eval-iters 3 \
     --save-interval ${SAVE_INTERVAL} \
     --tensorboard-queue-size 1 \
     --tensorboard-dir ${TENSORBOARD_DIR} \
@@ -328,13 +364,12 @@ megatron_options=" \
     --num-workers 8 \
     --extra-vocab-size ${EXTRA_VOCAB_SIZE} \
     --tokenizer-type DeepSeekV2Tokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL_PATH} \
     --swiglu \
     --normalization RMSNorm \
     --norm-epsilon ${RMS_NORM_EPS} \
     --use-rotary-position-embeddings \
-    --no-bias-swiglu-fusion \
     --no-rope-fusion \
-    --position-embedding-type rope \
     --untie-embeddings-and-output-weights \
     --disable-bias-linear \
     --rotary-base ${ROPE_THETA} \
@@ -342,12 +377,11 @@ megatron_options=" \
     --no-save-optim \
     --kv-channels ${V_HEAD_DIM} \
     --qk-layernorm \
+    --multi-latent-attention \
     --ckpt-format torch \
     --transformer-impl transformer_engine \
+    --no-masked-softmax-fusion \
     --use-rope-scaling \
-    --multi-latent-attention \
-    --mtp-num-layers 1 \
-    --use-mcore-models \
     "
 TORCH_PROFIE_ARGS=" \
@@ -355,7 +389,7 @@ TORCH_PROFIE_ARGS=" \
     --profile-ranks 0 1 2 3 4 5 6 7 \
     --profile-step-start 3 \
     --profile-step-end 4 \
-    --profile-dir torch_prof_data_16nodes_dcu \
+    --profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
     --use-pytorch-profiler \
     "
@@ -367,26 +401,30 @@ HIP_PROFIE_ARGS=" \
     --use-hip-profiler \
     "
-APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
-    ${megatron_options} \
-    ${dataset_options} \
-    ${pr_options} \
-    ${load_option} \
-    ${activation_checkpoint_options} \
-    ${do_option} \
-    ${sp_option} \
-    ${moe_options} \
-    ${offload_option} \
-    ${sft_options} \
-    ${vp_option} \
-    ${packing_options} \
-    ${uneven_split_option} \
-    ${attn_backend_option} \
-    ${comm_overlap_option} \
+DISTRIBUTED_ARGS=" \
     --rank ${RANK} \
     --world-size ${WORLD_SIZE} \
     --local-rank ${LOCAL_RANK} \
-    --dist-url tcp://${1}:25900 \
+    --dist-url tcp://${DIST_URL}:${DIST_PORT} \
+    "
+APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
+    ${megatron_options} \
+    ${dataset_options} \
+    ${pr_options} \
+    ${load_option} \
+    ${activation_checkpoint_options} \
+    ${do_option} \
+    ${sp_option} \
+    ${moe_options} \
+    ${offload_option} \
+    ${vp_option} \
+    ${packing_options} \
+    ${uneven_split_option} \
+    ${attn_backend_option} \
+    ${mtp_options} \
+    ${comm_overlap_option} \
+    ${DISTRIBUTED_ARGS} \
     "
 if [[ $profiling == "torch" ]]; then
@@ -397,37 +435,38 @@ elif [[ $profiling == "hip" ]]; then
     APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
 fi
+#for hygon cpu
 case ${LOCAL_RANK} in
     [0])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=0 --membind=0 ${APP}
         ;;
     [1])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=1 --membind=1 ${APP}
        ;;
    [2])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=2 --membind=2 ${APP}
        ;;
    [3])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=3 --membind=3 ${APP}
        ;;
    [4])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=4 --membind=4 ${APP}
        ;;
    [5])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=5 --membind=5 ${APP}
        ;;
    [6])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=6 --membind=6 ${APP}
       ;;
    [7])
-        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        numactl --cpunodebind=7 --membind=7 ${APP}
       ;;
 esac
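One effect of switching `TRAIN_TOKENS` and `WARMUP_TOKENS` to multiples of `GLOBAL_BATCH_SIZE * SEQ_LEN` is that the derived schedule lengths become fixed iteration counts, independent of the batch and sequence settings. A small arithmetic check for the 1-node config above (GLOBAL_BATCH_SIZE=256, SEQ_LEN=4096):

```
GLOBAL_BATCH_SIZE=256; SEQ_LEN=4096
TRAIN_TOKENS=$((10000 * GLOBAL_BATCH_SIZE * SEQ_LEN))    # 10,485,760,000 tokens
WARMUP_TOKENS=$((2000 * GLOBAL_BATCH_SIZE * SEQ_LEN))    #  2,097,152,000 tokens
echo "lr-decay-iters:  $((TRAIN_TOKENS / GLOBAL_BATCH_SIZE / SEQ_LEN))"    # 10000
echo "lr-warmup-iters: $((WARMUP_TOKENS / GLOBAL_BATCH_SIZE / SEQ_LEN))"   # 2000
```

Note that `TRAIN_ITERS` itself stays hard-coded (10 in these scripts); only the decay and warmup schedule is derived from the token counts.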
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=2 # enable depending on the actual setup
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=512
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
TP=2
PP=2
CP=1
ETP=1
EP=16
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###
### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
### OTHERS ###
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
attn_backend_option=" \
--attention-backend auto
"
fi
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=3
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
QK_NOPE_HEAD_DIM=128
QK_ROPE_HEAD_DIM=64
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=256
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 4 \
--moe-router-num-groups 8 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*1+[1]*2) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controlled by env
if [ -z ${MP_DATASET_TYPE} ];then
MP_DATASET_TYPE="idxmap"
fi
if [ -z ${MP_AC_LAYERS} ];then
MP_AC_LAYERS=1
fi
if [ -z ${MP_VP} ]; then
vp_option=""
else
vp_option=" \
--num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z ${MP_SFT_PACKING} ]; then
MP_SFT_PACKING=false
fi
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
echo "the num layers per pp rank must be a multiple of the recompute layers."
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
elif [ $AC = none ]; then
activation_checkpoint_options=" \
"
elif [ $AC = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
--tp-comm-overlap"
else
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option=""
fi
fi
if [ $PR = fp16 ]; then
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options=" \
--bf16"
elif [ $PR = fp8 ]; then
pr_options=" \
--bf16 \
--fp8-format hybrid \
--fp8-amax-compute-algo max \
--fp8-amax-history-len 1024"
fi
if [ $OPTIMIZER_OFFLOAD != false ] && [ $DO = false ]; then
echo "Offload optimizer is valid only if \$DO=true"
DO=true
fi
if [ $DO = true ]; then
do_option=" \
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_option=" \
"
fi
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_option=" \
--sequence-parallel"
elif [ $SP = false ]; then
sp_option=" \
"
fi
if [ -z ${MP_PP0_LAYERS} ];then
uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
_check=$(( ( $NUM_LAYERS - ${MP_PP0_LAYERS} ) % ( ${PP} - 1 ) ))
if [ $_check != 0 ]; then
echo "With uneven pipelineing the left over layers must be divisible by left over stages."
exit -1
fi
uneven_split_option=" \
--decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}
"
else
echo "uneven pipeline split must be used when PP > 1"
exit -1
fi
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--load $PRETRAIN_CHECKPOINT_PATH"
fi
if [ $OPTIMIZER_OFFLOAD != false ]; then
offload_option=" \
--optimizer-cpu-offload \
--use-precision-aware-optimizer \
--optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi
if [ $SFT = true ]; then
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
if [ ${MP_DATASET_TYPE} = "raw" ]; then
dataset_options=" \
--train-data-path ${DATASET_PATH} \
--valid-data-path ${VALID_DATASET_PATH} \
--dataloader-type cyclic \
--dataset JSON-SFT"
else
dataset_options=" \
--data-path ${DATASET_PATH} \
--split 99,1,0"
fi
if [ ${MP_SFT_PACKING} = true ]; then
echo "Currently MLA do not support THD format attention, thus sequence packing can not be used..."
packing_options=""
else
packing_options=""
fi
##### Prepare logdirs #######
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options=" \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--clip-grad 1.0 \
--init-method-std 0.008 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--lr-decay-iters ${LR_DECAY_ITERS} \
--lr-warmup-iters ${LR_WARMUP_ITERS} \
--train-iters ${TRAIN_ITERS} \
--micro-batch-size ${BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--log-timers-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--no-load-optim \
--no-load-rng \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
--rotary-scaling-factor ${SCALE_FACTOR} \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
"
TORCH_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--use-pytorch-profiler \
"
HIP_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 4 \
--profile-step-end 5 \
--use-hip-profiler \
"
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
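For the 4-node configuration above (32 ranks, TP=2, PP=2, CP=1, EP=16, ETP=1), a quick divisibility check, assuming the usual Megatron-LM relation DP = WORLD_SIZE / (TP * PP * CP):

```
WORLD_SIZE=32; TP=2; PP=2; CP=1
if (( WORLD_SIZE % (TP * PP * CP) != 0 )); then
    echo "world size is not divisible by TP*PP*CP" >&2
    exit 1
fi
echo "data-parallel size: $((WORLD_SIZE / (TP * PP * CP)))"   # 8 for this config
```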
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=2 # enable depending on the actual setup
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=4096
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
TP=4
PP=8
CP=1
ETP=2
EP=64
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###
### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
### OTHERS ###
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
attn_backend_option=" \
--attention-backend auto
"
fi
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=61
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
QK_NOPE_HEAD_DIM=128
QK_ROPE_HEAD_DIM=64
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=256
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 4 \
--moe-router-num-groups 8 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*3+[1]*58) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controlled by env
if [ -z ${MP_DATASET_TYPE} ];then
MP_DATASET_TYPE="idxmap"
fi
if [ -z ${MP_AC_LAYERS} ];then
MP_AC_LAYERS=1
fi
if [ -z ${MP_VP} ]; then
vp_option=""
else
vp_option=" \
--num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z ${MP_SFT_PACKING} ]; then
MP_SFT_PACKING=false
fi
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
echo "the num layers per pp rank must be a multiple of the recompute layers."
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
elif [ $AC = none ]; then
activation_checkpoint_options=" \
"
elif [ $AC = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
--tp-comm-overlap"
else
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option=""
fi
fi
if [ $PR = fp16 ]; then
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options=" \
--bf16"
elif [ $PR = fp8 ]; then
pr_options=" \
--bf16 \
--fp8-format hybrid \
--fp8-amax-compute-algo max \
--fp8-amax-history-len 1024"
fi
if [ $OPTIMIZER_OFFLOAD != false ] && [ $DO = false ]; then
echo "Offload optimizer is valid only if \$DO=true"
DO=true
fi
if [ $DO = true ]; then
do_option=" \
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_option=" \
"
fi
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_option=" \
--sequence-parallel"
elif [ $SP = false ]; then
sp_option=" \
"
fi
if [ -z ${MP_PP0_LAYERS} ];then
uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
_check=$(( ( $NUM_LAYERS - ${MP_PP0_LAYERS} ) % ( ${PP} - 1 ) ))
if [ $_check != 0 ]; then
echo "With uneven pipelineing the left over layers must be divisible by left over stages."
exit -1
fi
uneven_split_option=" \
--decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}
"
else
echo "uneven pipeline split must be used when PP > 1"
exit -1
fi
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--load $PRETRAIN_CHECKPOINT_PATH"
fi
if [ $OPTIMIZER_OFFLOAD != false ]; then
offload_option=" \
--optimizer-cpu-offload \
--use-precision-aware-optimizer \
--optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi
if [ $SFT = true ]; then
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
if [ ${MP_DATASET_TYPE} = "raw" ]; then
dataset_options=" \
--train-data-path ${DATASET_PATH} \
--valid-data-path ${VALID_DATASET_PATH} \
--dataloader-type cyclic \
--dataset JSON-SFT"
else
dataset_options=" \
--data-path ${DATASET_PATH} \
--split 99,1,0"
fi
if [ ${MP_SFT_PACKING} = true ]; then
echo "Currently MLA do not support THD format attention, thus sequence packing can not be used..."
packing_options=""
else
packing_options=""
fi
##### Prepare logdirs #######
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options=" \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--clip-grad 1.0 \
--init-method-std 0.008 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--lr-decay-iters ${LR_DECAY_ITERS} \
--lr-warmup-iters ${LR_WARMUP_ITERS} \
--train-iters ${TRAIN_ITERS} \
--micro-batch-size ${BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--log-timers-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--no-load-optim \
--no-load-rng \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
--rotary-scaling-factor ${SCALE_FACTOR} \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
"
TORCH_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--use-pytorch-profiler \
"
HIP_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 4 \
--profile-step-end 5 \
--use-hip-profiler \
"
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
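Across the three DeepSeek-V3 configurations, the per-step token budget scales with the global batch size while the sequence length stays at 4096. A quick comparison using only the values set in the scripts:

```
SEQ_LEN=4096
for cfg in "1node:256" "4nodes:512" "multinode:4096"; do
    name=${cfg%%:*}; gbs=${cfg##*:}
    echo "${name}: GBS=${gbs} -> $((gbs * SEQ_LEN)) tokens per step"
done
# 1node:     256  * 4096 =  1,048,576 tokens/step
# 4nodes:    512  * 4096 =  2,097,152 tokens/step
# multinode: 4096 * 4096 = 16,777,216 tokens/step
```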
# GPT3 MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH="" #<Specify path>
VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
```
NOTE: Depending on the environment you are running in, the above command might look slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run the 175B model. There are other configs you could run as well:
### 345M
```
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--seq-length 1024 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
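To try one of these smaller configurations, one option is to substitute the flags above for the 175B settings inside `train_gpt3_175b_distributed.sh`. A rough sketch, assuming the script groups its flags in arrays (the `GPT_MODEL_ARGS` and `MODEL_PARALLEL_ARGS` names are illustrative, not taken from this excerpt):

```
# Illustrative only: the 345M settings from the table above.
GPT_MODEL_ARGS=(
    --num-layers 12
    --hidden-size 512
    --num-attention-heads 8
    --seq-length 1024
)
MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 1
)
```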
@@ -6,7 +6,7 @@ do
 done
 mpirun -np 8 --allow-run-as-root \
-    train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
+    train_gpt_567B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
 wait
@@ -5,11 +5,11 @@ do
     fi
 done
-mpirun -np 512 --hostfile hostfile_gpt_567B \
+mpirun -np 1024 --hostfile hostfile_gpt_567B \
     --allow-run-as-root \
     --bind-to none \
     --mca plm_rsh_no_tree_spawn 1 \
-    train_gpt_567B_multinodes.sh node059 --profiling=$profiling > output.log 2>&1
+    train_gpt_567B_multinodes.sh node059 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
 wait
...@@ -93,11 +93,11 @@ TRAINING_ARGS=( ...@@ -93,11 +93,11 @@ TRAINING_ARGS=(
--global-batch-size 256 --global-batch-size 256
--lr 1e-4 --lr 1e-4
--train-iters 10 --train-iters 10
--lr-decay-iters 320000 --lr-decay-iters 10000
--lr-decay-style cosine --lr-decay-style cosine
--min-lr 1.0e-5 --min-lr 1.0e-6
--weight-decay 0.1 --weight-decay 0.1
--lr-warmup-iters 500 --lr-warmup-iters 2000
--clip-grad 1.0 --clip-grad 1.0
--bf16 --bf16
--overlap-param-gather --overlap-param-gather
...@@ -126,6 +126,7 @@ MODEL_PARALLEL_ARGS=( ...@@ -126,6 +126,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 1 --pipeline-model-parallel-size 1
--expert-model-parallel-size 4 --expert-model-parallel-size 4
--expert-tensor-parallel-size 2 --expert-tensor-parallel-size 2
--context-parallel-size 1
--use-distributed-optimizer --use-distributed-optimizer
--sequence-parallel --sequence-parallel
) )
...@@ -173,42 +174,34 @@ fi ...@@ -173,42 +174,34 @@ fi
case ${LOCAL_RANK} in case ${LOCAL_RANK} in
[0]) [0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=0 --membind=0 ${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=1 --membind=1 ${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;; ;;
[2]) [2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=2 --membind=2 ${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;; ;;
[3]) [3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=3 --membind=3 ${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[4]) [4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=4 --membind=4 ${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;; ;;
[5]) [5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=5 --membind=5 ${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;; ;;
[6]) [6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=6 --membind=6 ${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;; ;;
[7]) [7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=7 --membind=7 ${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;; ;;
esac esac
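Every branch of the case statement above applies the same pattern with a different NUMA node index, so the change amounts to binding each local rank's CPU threads and memory allocations to one node. A compact equivalent, assuming `LOCAL_RANK` runs from 0 to 7 and maps one-to-one onto NUMA nodes, would be:
```
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Pin this rank's host-side work (data loading, pinned-buffer staging) to its local NUMA node.
numactl --cpunodebind="${LOCAL_RANK}" --membind="${LOCAL_RANK}" ${APP}
```
Keeping the explicit eight-way case statement, as the commit does, makes it easy to adjust the binding for an individual rank if the node numbering ever differs.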
...@@ -90,14 +90,14 @@ DATA_ARGS=( ...@@ -90,14 +90,14 @@ DATA_ARGS=(
TRAINING_ARGS=( TRAINING_ARGS=(
--micro-batch-size 1 --micro-batch-size 1
--global-batch-size 1024 --global-batch-size 2048
--lr 1e-4 --lr 1e-4
--train-iters 10 --train-iters 10
--lr-decay-iters 320000 --lr-decay-iters 10000
--lr-decay-style cosine --lr-decay-style cosine
--min-lr 1.0e-5 --min-lr 1.0e-6
--weight-decay 0.1 --weight-decay 0.1
--lr-warmup-iters 500 --lr-warmup-iters 2000
--clip-grad 1.0 --clip-grad 1.0
--bf16 --bf16
--overlap-param-gather --overlap-param-gather
...@@ -109,7 +109,7 @@ TORCH_PROFIE_ARGS=( ...@@ -109,7 +109,7 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7 --profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3 --profile-step-start 3
--profile-step-end 4 --profile-step-end 4
--profile-dir torch_prof_gpt_64nodes_tp4-pp8-ep16-ep_tp4-cp2 --profile-dir torch_prof_gpt_64nodes_tp4-pp16-ep16-ep_tp4-cp2
--use-pytorch-profiler --use-pytorch-profiler
) )
...@@ -123,11 +123,10 @@ HIP_PROFIE_ARGS=( ...@@ -123,11 +123,10 @@ HIP_PROFIE_ARGS=(
MODEL_PARALLEL_ARGS=( MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4 --tensor-model-parallel-size 4
--pipeline-model-parallel-size 8 --pipeline-model-parallel-size 16
--expert-model-parallel-size 16 --expert-model-parallel-size 16
--expert-tensor-parallel-size 4 --expert-tensor-parallel-size 4
--context-parallel-size 2 --context-parallel-size 2
#--num-layers-per-virtual-pipeline-stage 2
--use-distributed-optimizer --use-distributed-optimizer
--sequence-parallel --sequence-parallel
) )
...@@ -175,42 +174,34 @@ fi ...@@ -175,42 +174,34 @@ fi
case ${LOCAL_RANK} in case ${LOCAL_RANK} in
[0]) [0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=0 --membind=0 ${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=1 --membind=1 ${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;; ;;
[2]) [2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=2 --membind=2 ${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;; ;;
[3]) [3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=3 --membind=3 ${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[4]) [4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=4 --membind=4 ${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;; ;;
[5]) [5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=5 --membind=5 ${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;; ;;
[6]) [6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=6 --membind=6 ${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;; ;;
[7]) [7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=7 --membind=7 ${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;; ;;
esac esac
...@@ -6,7 +6,7 @@ do ...@@ -6,7 +6,7 @@ do
done done
mpirun -np 8 --allow-run-as-root \ mpirun -np 8 --allow-run-as-root \
train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1 train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
wait wait
......
...@@ -9,7 +9,7 @@ mpirun -np 64 --hostfile hostfile_mixtral_8x22B \ ...@@ -9,7 +9,7 @@ mpirun -np 64 --hostfile hostfile_mixtral_8x22B \
--allow-run-as-root \ --allow-run-as-root \
--bind-to none \ --bind-to none \
--mca plm_rsh_no_tree_spawn 1 \ --mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > output.log 2>&1 train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > log-8nodes-`date +%F-%H%M`.log 2>&1
wait wait
......
...@@ -6,7 +6,7 @@ do ...@@ -6,7 +6,7 @@ do
done done
mpirun -np 8 --allow-run-as-root \ mpirun -np 8 --allow-run-as-root \
train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1 train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
wait wait
......
...@@ -9,7 +9,7 @@ mpirun -np 32 --hostfile hostfile_mixtral_8x7B \ ...@@ -9,7 +9,7 @@ mpirun -np 32 --hostfile hostfile_mixtral_8x7B \
--allow-run-as-root \ --allow-run-as-root \
--bind-to none \ --bind-to none \
--mca plm_rsh_no_tree_spawn 1 \ --mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > output.log 2>&1 train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
wait wait
......
...@@ -96,11 +96,11 @@ TRAINING_ARGS=( ...@@ -96,11 +96,11 @@ TRAINING_ARGS=(
--global-batch-size 256 --global-batch-size 256
--lr 1e-4 --lr 1e-4
--train-iters 10 --train-iters 10
--lr-decay-iters 320000 --lr-decay-iters 10000
--lr-decay-style cosine --lr-decay-style cosine
--min-lr 1.0e-5 --min-lr 1.0e-6
--weight-decay 0.1 --weight-decay 0.1
--lr-warmup-iters 500 --lr-warmup-iters 2000
--clip-grad 1.0 --clip-grad 1.0
--bf16 --bf16
--overlap-param-gather --overlap-param-gather
...@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=( ...@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 1 --pipeline-model-parallel-size 1
--expert-model-parallel-size 8 --expert-model-parallel-size 8
--expert-tensor-parallel-size 1 --expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer --use-distributed-optimizer
--sequence-parallel --sequence-parallel
) )
...@@ -143,7 +144,8 @@ LOGGING_ARGS=( ...@@ -143,7 +144,8 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \ #--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \ --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \ --no-load-optim \
--no-load-rng --no-load-rng \
--no-save-optim
) )
if [ -n "${WANDB_API_KEY}" ]; then if [ -n "${WANDB_API_KEY}" ]; then
...@@ -175,43 +177,34 @@ fi ...@@ -175,43 +177,34 @@ fi
case ${LOCAL_RANK} in case ${LOCAL_RANK} in
[0]) [0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=0 --membind=0 ${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=1 --membind=1 ${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;; ;;
[2]) [2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=2 --membind=2 ${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;; ;;
[3]) [3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=3 --membind=3 ${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[4]) [4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=4 --membind=4 ${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;; ;;
[5]) [5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=5 --membind=5 ${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;; ;;
[6]) [6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=6 --membind=6 ${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;; ;;
[7]) [7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=7 --membind=7 ${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;; ;;
esac esac
...@@ -96,11 +96,11 @@ TRAINING_ARGS=( ...@@ -96,11 +96,11 @@ TRAINING_ARGS=(
--global-batch-size 256 --global-batch-size 256
--lr 1e-4 --lr 1e-4
--train-iters 10 --train-iters 10
--lr-decay-iters 320000 --lr-decay-iters 10000
--lr-decay-style cosine --lr-decay-style cosine
--min-lr 1.0e-5 --min-lr 1.0e-6
--weight-decay 0.1 --weight-decay 0.1
--lr-warmup-iters 500 --lr-warmup-iters 2000
--clip-grad 1.0 --clip-grad 1.0
--bf16 --bf16
--overlap-param-gather --overlap-param-gather
...@@ -112,7 +112,7 @@ TORCH_PROFIE_ARGS=( ...@@ -112,7 +112,7 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7 --profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3 --profile-step-start 3
--profile-step-end 4 --profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-ep_tp1-cp1 --profile-dir torch_prof_mixtral8x22B_1nodes_tp4-pp8-ep8-ep_tp1-cp1
--use-pytorch-profiler --use-pytorch-profiler
) )
...@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=( ...@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 8 --pipeline-model-parallel-size 8
--expert-model-parallel-size 8 --expert-model-parallel-size 8
--expert-tensor-parallel-size 1 --expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer --use-distributed-optimizer
--sequence-parallel --sequence-parallel
) )
...@@ -143,7 +144,8 @@ LOGGING_ARGS=( ...@@ -143,7 +144,8 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \ #--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \ --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \ --no-load-optim \
--no-load-rng --no-load-rng \
--no-save-optim
) )
if [ -n "${WANDB_API_KEY}" ]; then if [ -n "${WANDB_API_KEY}" ]; then
...@@ -175,43 +177,34 @@ fi ...@@ -175,43 +177,34 @@ fi
case ${LOCAL_RANK} in case ${LOCAL_RANK} in
[0]) [0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=0 --membind=0 ${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=1 --membind=1 ${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;; ;;
[2]) [2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=2 --membind=2 ${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;; ;;
[3]) [3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=3 --membind=3 ${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[4]) [4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=4 --membind=4 ${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;; ;;
[5]) [5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=5 --membind=5 ${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;; ;;
[6]) [6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=6 --membind=6 ${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;; ;;
[7]) [7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP} numactl --cpunodebind=7 --membind=7 ${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;; ;;
esac esac