Commit 70368616 authored by silencealiang

update model parameters

parent 8551c38e
@@ -2,14 +2,13 @@ for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
mpirun -np 8 --allow-run-as-root \
train_deepseek_v3_1node.sh localhost --profiling=$profiling > output.log 2>&1
train_deepseekv3_671B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
wait
rm -rf CKPT
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
mpirun -np 32 --hostfile hostfile_deepseekv3_671B_4nodes \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_4nodes.sh node002 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
wait
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
mpirun -np 1024 --hostfile hostfile_deepseekv3_671B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_multinodes.sh node001 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
wait
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
@@ -22,22 +32,20 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/public/home/yuguo/check/rccl-tests-0204/topo-input.xml" #"your topo file"
export GLOG_minloglevel=3
export GROUPED_GEMM_BatchLinear=1
export LD_LIBRARY_PATH=/public/home/yuguo/data/rocblas-install-0224/lib:$LD_LIBRARY_PATH
export NCCL_TOPO_FILE="./topo-input.xml"
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
#export MP_PP0_LAYERS=2 # enable as needed, depending on the actual setup
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256
LR=1e-5
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
@@ -45,6 +53,7 @@ PR=bf16
TP=1
PP=2
CP=1
ETP=1
EP=4
SP=true
DO=true
@@ -56,13 +65,14 @@ SFT=false
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
VALID_DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
PRETRAIN_CHECKPOINT_PATH=${MEGATRON_PATH}/deepseekv3_dataset #"your model path"
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
# the following two values will not be used when SFT is true
TRAIN_TOKENS=100000000
WARMUP_TOKENS=10000
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
@@ -72,20 +82,19 @@ if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
export NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1
attn_backend_option=" \
--attention-backend fused
--attention-backend auto
"
fi
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=2
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=2
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=${SEQ_LEN}
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
@@ -94,32 +103,43 @@ if [ $MODEL_SIZE = A37B ]; then
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=8 #256
NUM_EXPERTS=8
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 1 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size 1 \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type aux_loss \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*0+[1]*2) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
"
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 2 \
--moe-router-num-groups 4 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*1+[1]*1) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controlled by env vars
@@ -147,6 +167,14 @@ comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
@@ -154,9 +182,9 @@ if [ $AC = full ]; then
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
@@ -165,8 +193,8 @@ elif [ $AC = none ]; then
"
elif [ $AC = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
@@ -179,8 +207,8 @@ fi
if [ $PR = fp16 ]; then
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options=" \
@@ -200,7 +228,7 @@ fi
if [ $DO = true ]; then
do_option=" \
--use-distributed-optimizer"
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_option=" \
@@ -210,7 +238,7 @@ fi
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_option=" \
--sequence-parallel"
--sequence-parallel"
elif [ $SP = false ]; then
sp_option=" \
@@ -236,7 +264,7 @@ fi
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--tokenizer-model $PRETRAIN_CHECKPOINT_PATH"
--load $PRETRAIN_CHECKPOINT_PATH"
fi
if [ $OPTIMIZER_OFFLOAD != false ]; then
@@ -247,15 +275,21 @@ if [ $OPTIMIZER_OFFLOAD != false ]; then
fi
if [ $SFT = true ]; then
TRAIN_ITERS=${24}
LR_WARMUP_ITERS=${25}
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3"
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3"
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
if [ ${MP_DATASET_TYPE} = "raw" ]; then
@@ -278,16 +312,18 @@ else
fi
##### Prepare logdirs #######
NAME="${PREFIX}"
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options=" \
--lr ${LR} \
@@ -314,7 +350,7 @@ megatron_options=" \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 5 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
@@ -328,13 +364,12 @@ megatron_options=" \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-bias-swiglu-fusion \
--no-rope-fusion \
--position-embedding-type rope \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
@@ -342,12 +377,11 @@ megatron_options=" \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
--multi-latent-attention \
--mtp-num-layers 1 \
--use-mcore-models \
"
TORCH_PROFIE_ARGS=" \
@@ -355,7 +389,7 @@ TORCH_PROFIE_ARGS=" \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_data_16nodes_dcu \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--use-pytorch-profiler \
"
@@ -367,26 +401,30 @@ HIP_PROFIE_ARGS=" \
--use-hip-profiler \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${sft_options} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${comm_overlap_option} \
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${1}:25900 \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
if [[ $profiling == "torch" ]]; then
@@ -397,37 +435,38 @@ elif [[ $profiling == "hip" ]]; then
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
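# Note (illustrative): the loop above only recognizes an optional
# --profiling=<torch|hip> argument; the accompanying run script passes it after
# the master hostname, e.g. `train_deepseekv3_671B_4nodes.sh node002 --profiling=torch`.
# Any other value leaves profiling disabled.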
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
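# Assumption: ./topo-input.xml is the <system version="2"> topology dump included
# alongside these scripts (GPUs, xGMI links and mlx5 NICs per NUMA node); RCCL reads
# it via NCCL_TOPO_FILE, so the file must exist in the launch directory.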
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=2 # enable as needed, depending on the actual setup
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=512
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
TP=2
PP=2
CP=1
ETP=1
EP=16
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###
### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
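# Worked example: with the definitions above, the iteration counts derived later in
# the non-SFT branch are independent of batch size and sequence length:
#   LR_WARMUP_ITERS = WARMUP_TOKENS / GLOBAL_BATCH_SIZE / SEQ_LEN = 2000
#   LR_DECAY_ITERS  = TRAIN_TOKENS  / GLOBAL_BATCH_SIZE / SEQ_LEN = 10000
# TRAIN_ITERS itself is set explicitly (TRAIN_ITERS=10) for this short run.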
###############################
OUTPUT_BASEPATH=./output
### OTHERS ###
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
attn_backend_option=" \
--attention-backend auto
"
fi
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=3
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
QK_NOPE_HEAD_DIM=128
QK_ROPE_HEAD_DIM=64
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=256
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 4 \
--moe-router-num-groups 8 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*1+[1]*2) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controlled by env vars
if [ -z ${MP_DATASET_TYPE} ];then
MP_DATASET_TYPE="idxmap"
fi
if [ -z ${MP_AC_LAYERS} ];then
MP_AC_LAYERS=1
fi
if [ -z ${MP_VP} ]; then
vp_option=""
else
vp_option=" \
--num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z ${MP_SFT_PACKING} ]; then
MP_SFT_PACKING=false
fi
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
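# Note: TP_COMM_OVERLAP above evaluates to 1 because TP=2 in this script, but the
# --tp-comm-overlap branch below is commented out, so only gradient-reduce and
# parameter-gather overlap with compute are enabled here.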
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
echo "the num layers per pp rank must be a multiple of the recompute layers."
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
elif [ $AC = none ]; then
activation_checkpoint_options=" \
"
elif [ $AC = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
--tp-comm-overlap"
else
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option=""
fi
fi
if [ $PR = fp16 ]; then
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options=" \
--bf16"
elif [ $PR = fp8 ]; then
pr_options=" \
--bf16 \
--fp8-format hybrid \
--fp8-amax-compute-algo max \
--fp8-amax-history-len 1024"
fi
if [ $OPTIMIZER_OFFLOAD != false ] && [ $DO = false ]; then
echo "Offload optimizer is valid only if \$DO=true"
DO=true
fi
if [ $DO = true ]; then
do_option=" \
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_option=" \
"
fi
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_option=" \
--sequence-parallel"
elif [ $SP = false ]; then
sp_option=" \
"
fi
if [ -z ${MP_PP0_LAYERS} ];then
uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
_check=$(( ( $NUM_LAYERS - ${MP_PP0_LAYERS} ) % ( ${PP} - 1 ) ))
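# Illustrative check with this script's defaults (NUM_LAYERS=3, MP_PP0_LAYERS=2, PP=2):
# (3 - 2) % (2 - 1) = 0, so the uneven first-stage split is accepted.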
if [ $_check != 0 ]; then
echo "With uneven pipelineing the left over layers must be divisible by left over stages."
exit -1
fi
uneven_split_option=" \
--decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}
"
else
echo "uneven pipeline split must be used when PP > 1"
exit -1
fi
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--load $PRETRAIN_CHECKPOINT_PATH"
fi
if [ $OPTIMIZER_OFFLOAD != false ]; then
offload_option=" \
--optimizer-cpu-offload \
--use-precision-aware-optimizer \
--optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi
if [ $SFT = true ]; then
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
if [ ${MP_DATASET_TYPE} = "raw" ]; then
dataset_options=" \
--train-data-path ${DATASET_PATH} \
--valid-data-path ${VALID_DATASET_PATH} \
--dataloader-type cyclic \
--dataset JSON-SFT"
else
dataset_options=" \
--data-path ${DATASET_PATH} \
--split 99,1,0"
fi
if [ ${MP_SFT_PACKING} = true ]; then
echo "Currently MLA do not support THD format attention, thus sequence packing can not be used..."
packing_options=""
else
packing_options=""
fi
##### Prepare logdirs #######
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options=" \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--clip-grad 1.0 \
--init-method-std 0.008 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--lr-decay-iters ${LR_DECAY_ITERS} \
--lr-warmup-iters ${LR_WARMUP_ITERS} \
--train-iters ${TRAIN_ITERS} \
--micro-batch-size ${BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--log-timers-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--no-load-optim \
--no-load-rng \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
--rotary-scaling-factor ${SCALE_FACTOR} \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
"
TORCH_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--use-pytorch-profiler \
"
HIP_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 4 \
--profile-step-end 5 \
--use-hip-profiler \
"
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
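# Illustrative note: the case statement below keeps all 8 GPUs visible to every
# process and uses numactl to bind each local rank's CPU threads and memory
# allocations to a distinct NUMA node (rank N -> node N) on the Hygon host.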
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=2 # enable as needed, depending on the actual setup
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=4096
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
TP=4
PP=8
CP=1
ETP=2
EP=64
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###
### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
### OTHERS ###
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
attn_backend_option=" \
--attention-backend auto
"
fi
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=61
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
QK_NOPE_HEAD_DIM=128
QK_ROPE_HEAD_DIM=64
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=256
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 4 \
--moe-router-num-groups 8 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*3+[1]*58) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controlled by env vars
if [ -z ${MP_DATASET_TYPE} ];then
MP_DATASET_TYPE="idxmap"
fi
if [ -z ${MP_AC_LAYERS} ];then
MP_AC_LAYERS=1
fi
if [ -z ${MP_VP} ]; then
vp_option=""
else
vp_option=" \
--num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z ${MP_SFT_PACKING} ]; then
MP_SFT_PACKING=false
fi
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
echo "the num layers per pp rank must be a multiple of the recompute layers."
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
elif [ $AC = none ]; then
activation_checkpoint_options=" \
"
elif [ $AC = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
--tp-comm-overlap"
else
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option=""
fi
fi
if [ $PR = fp16 ]; then
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options=" \
--bf16"
elif [ $PR = fp8 ]; then
pr_options=" \
--bf16 \
--fp8-format hybrid \
--fp8-amax-compute-algo max \
--fp8-amax-history-len 1024"
fi
if [ $OPTIMIZER_OFFLOAD != false ] && [ $DO = false ]; then
echo "Offload optimizer is valid only if \$DO=true"
DO=true
fi
if [ $DO = true ]; then
do_option=" \
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_option=" \
"
fi
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_option=" \
--sequence-parallel"
elif [ $SP = false ]; then
sp_option=" \
"
fi
if [ -z ${MP_PP0_LAYERS} ];then
uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
_check=$(( ( $NUM_LAYERS - ${MP_PP0_LAYERS} ) % ( ${PP} - 1 ) ))
if [ $_check != 0 ]; then
echo "With uneven pipelineing the left over layers must be divisible by left over stages."
exit -1
fi
uneven_split_option=" \
--decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}
"
else
echo "uneven pipeline split must be used when PP > 1"
exit -1
fi
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--load $PRETRAIN_CHECKPOINT_PATH"
fi
if [ $OPTIMIZER_OFFLOAD != false ]; then
offload_option=" \
--optimizer-cpu-offload \
--use-precision-aware-optimizer \
--optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi
if [ $SFT = true ]; then
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
if [ ${MP_DATASET_TYPE} = "raw" ]; then
dataset_options=" \
--train-data-path ${DATASET_PATH} \
--valid-data-path ${VALID_DATASET_PATH} \
--dataloader-type cyclic \
--dataset JSON-SFT"
else
dataset_options=" \
--data-path ${DATASET_PATH} \
--split 99,1,0"
fi
if [ ${MP_SFT_PACKING} = true ]; then
echo "Currently MLA do not support THD format attention, thus sequence packing can not be used..."
packing_options=""
else
packing_options=""
fi
##### Prepare logdirs #######
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options=" \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--clip-grad 1.0 \
--init-method-std 0.008 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--lr-decay-iters ${LR_DECAY_ITERS} \
--lr-warmup-iters ${LR_WARMUP_ITERS} \
--train-iters ${TRAIN_ITERS} \
--micro-batch-size ${BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--log-timers-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--no-load-optim \
--no-load-rng \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
--rotary-scaling-factor ${SCALE_FACTOR} \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
"
TORCH_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--use-pytorch-profiler \
"
HIP_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 4 \
--profile-step-end 5 \
--use-hip-profiler \
"
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
# GPT3 MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH=""#<Specify path>
VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
```
NOTE: Depending on the environment you are running it in, the above command may look slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run the 175B model. There are other configurations you could run as well:
### 345M
```
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--seq-length 1024 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
@@ -6,7 +6,7 @@ do
done
mpirun -np 8 --allow-run-as-root \
train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
train_gpt_567B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
wait
@@ -5,11 +5,11 @@ do
fi
done
mpirun -np 512 --hostfile hostfile_gpt_567B \
mpirun -np 1024 --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_gpt_567B_multinodes.sh node059 --profiling=$profiling > output.log 2>&1
train_gpt_567B_multinodes.sh node059 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
wait
@@ -93,11 +93,11 @@ TRAINING_ARGS=(
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
@@ -126,6 +126,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 1
--expert-model-parallel-size 4
--expert-tensor-parallel-size 2
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
@@ -173,42 +174,34 @@ fi
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
@@ -90,14 +90,14 @@ DATA_ARGS=(
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 1024
--global-batch-size 2048
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
@@ -109,7 +109,7 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_64nodes_tp4-pp8-ep16-ep_tp4-cp2
--profile-dir torch_prof_gpt_64nodes_tp4-pp16-ep16-ep_tp4-cp2
--use-pytorch-profiler
)
@@ -123,11 +123,10 @@ HIP_PROFIE_ARGS=(
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
--pipeline-model-parallel-size 16
--expert-model-parallel-size 16
--expert-tensor-parallel-size 4
--context-parallel-size 2
#--num-layers-per-virtual-pipeline-stage 2
--use-distributed-optimizer
--sequence-parallel
)
@@ -175,42 +174,34 @@ fi
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
@@ -6,7 +6,7 @@ do
done
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
wait
@@ -9,7 +9,7 @@ mpirun -np 64 --hostfile hostfile_mixtral_8x22B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > output.log 2>&1
train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > log-8nodes-`date +%F-%H%M`.log 2>&1
wait
@@ -6,7 +6,7 @@ do
done
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
wait
@@ -9,7 +9,7 @@ mpirun -np 32 --hostfile hostfile_mixtral_8x7B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > output.log 2>&1
train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
wait
@@ -96,11 +96,11 @@ TRAINING_ARGS=(
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 1
--expert-model-parallel-size 8
--expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
@@ -143,7 +144,8 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
--no-load-rng \
--no-save-optim
)
if [ -n "${WANDB_API_KEY}" ]; then
@@ -175,43 +177,34 @@ fi
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
@@ -96,11 +96,11 @@ TRAINING_ARGS=(
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
@@ -112,7 +112,7 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-ep_tp1-cp1
--profile-dir torch_prof_mixtral8x22B_1nodes_tp4-pp8-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 8
--expert-model-parallel-size 8
--expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
@@ -143,7 +144,8 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
--no-load-rng \
--no-save-optim
)
if [ -n "${WANDB_API_KEY}" ]; then
@@ -175,43 +177,34 @@ fi
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac