Commit 6c3cfb1d authored by silencealiang's avatar silencealiang
Browse files

update model parameters format

parent 935bfd74
[submodule "Megatron-LM"]
path = Megatron-LM
url = https://github.com/NVIDIA/Megatron-LM.git
branch = 4429e8ebe21fb0
branch = d580efc68a9f0d
[submodule]
Megatron-LM = main
......@@ -25,11 +25,11 @@ class MegatronAdaptation:
MegatronAdaptation.apply()
# apply features
from .patch_utils import MegatronPatchesManager
from .features_manager import a2a_overlap_adaptation
# from .patch_utils import MegatronPatchesManager
# from .features_manager import a2a_overlap_adaptation
a2a_overlap_adaptation(MegatronPatchesManager)
MegatronPatchesManager.apply_patches()
# a2a_overlap_adaptation(MegatronPatchesManager)
# MegatronPatchesManager.apply_patches()
@classmethod
def register(cls, orig_func_name, new_func=None, force_patch=False, create_dummy=False, apply_wrapper=False, remove_origin_wrappers=False):
......
......@@ -94,7 +94,7 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
print("> initializing torch distributed ...", flush=True)
# Manually set the device ids.
if device_count > 0:
torch.cuda.set_device(args.local_rank)
torch.cuda.set_device(args.local_rank % device_count)
device_id = torch.device(f'cuda:{args.local_rank}')
else:
device_id = None
......
# Launch the single-node DeepSeek-V3 training run under mpirun.
# Recognized option: --profiling=<torch|hip>; when present it is forwarded to
# the per-node script and the profiler-related GPU env vars are enabled.
# Fix: iterate over "$@" (quoted) so arguments containing whitespace are not
# re-split, and quote $profiling when forwarding it.
for para in "$@"
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
# These two env vars are required for meaningful profiler traces on this stack.
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
# 8 ranks = 8 GPUs on one node; stdout/stderr of the whole run goes to output.log.
mpirun -np 8 --allow-run-as-root \
train_deepseek_v3_1node.sh localhost --profiling="$profiling" > output.log 2>&1
wait
# Remove the checkpoint directory produced by the smoke run.
rm -rf CKPT
# Parse launcher options: only --profiling=<mode> is recognized here; the value
# is forwarded verbatim to the per-node training script below.
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Those variables need to modify
# NOTE(review): all of these are empty placeholders; mpirun -np "" fails, so
# they must be filled in before this launcher is usable.
GPUS="" # how many gpus to use
DTK_ENV="" # where env.sh of dtk
NCCL_ENV="" # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST="" # hostname
PORT="" # port id
DATA_PATH="" # path to mmap_deepseekv3_datasets_text_document
TOKENIZER_MODEL_PATH="" # path to deepseekv3_dataset
CHECKPOINT_PATH="" # path to ckpt
# Runs DeepseekV3 671B model
# One MPI rank per GPU; ${GPUS}/8 (nodes) selects both the per-node script name
# and the log-file tag. Each rank sources the DTK and NCCL environments before
# exec'ing the training script.
mpirun -np ${GPUS} --hostfile hostfile_deepseekv3_671B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
bash -c "
source ${DTK_ENV} && \
source ${NCCL_ENV} && \
./train_deepseekv3_671B_$((${GPUS} / 8))nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling" > log-$((${GPUS} / 8))nodes-`date +%F-%H%M`.log 2>&1
wait
\ No newline at end of file
#!/bin/bash
# Parse "--key=value" style CLI options into plain shell variables.
# Recognized: --data_path, --tokenizer_path, --checkpoint_path, --profiling.
# Unknown options are silently ignored (same as the original if/elif chain).
parse_args() {
  local para
  for para in "$@"; do
    case "$para" in
      --data_path*)       data_path=${para#*=} ;;
      --tokenizer_path*)  tokenizer_path=${para#*=} ;;
      --checkpoint_path*) checkpoint_path=${para#*=} ;;
      --profiling*)       profiling=${para#*=} ;;
    esac
  done
}
# "$@" (quoted) preserves arguments containing whitespace; the original
# unquoted $* would have re-split them.
parse_args "$@"
# default env
# Rendezvous endpoint for torch.distributed, passed positionally by the
# mpirun launcher: $1 = host, $2 = port.
DIST_URL=${1}
DIST_PORT=${2}
# Rank / world size are provided by Open MPI's environment.
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
# This script lives two directory levels below the Megatron checkout root.
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=5 # whether to enable this depends on the actual setup
### BASE CONFIG ###
# Core training hyper-parameters for the 671B run.
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=4096
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
# TP/PP/CP/ETP/EP: tensor / pipeline / context / expert-tensor / expert
# parallel sizes. SP: sequence parallelism, DO: distributed optimizer,
# SFT: supervised fine-tuning mode.
# FL presumably toggles the flash-attention backend (see branch below) —
# TODO confirm.
TP=4
PP=8
CP=1
ETP=2
EP=64
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###
### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
# Paths come from the --data_path/--checkpoint_path/--tokenizer_path CLI args.
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
# FL=true: no-op (keep the default attention backend); FL=false would request
# automatic backend selection instead.
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
attn_backend_option=" \
--attention-backend auto
"
fi
# Model hyper-parameters for the DeepSeek-V3 "A37B" (671B total / 37B
# activated) configuration, plus the MoE/MLA command-line flags built from them.
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=61
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
# MLA low-rank projection dims and per-head dims.
Q_LORA_RANK=1536
KV_LORA_RANK=512
QK_NOPE_HEAD_DIM=128
QK_ROPE_HEAD_DIM=64
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=256
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
# NOTE: no comments may be placed inside the quoted string below — every word
# in it is splatted onto the python command line at launch time.
# --moe-layer-freq ([0]*3+[1]*58): first 3 layers dense, remaining 58 MoE.
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 4 \
--moe-router-num-groups 8 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*3+[1]*58) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controlled by env vars; fill in defaults when unset.
# Fix: expansions inside [ ] are quoted so that an unset or multi-word value
# cannot break the test's argument parsing.
if [ -z "${MP_DATASET_TYPE:-}" ];then
MP_DATASET_TYPE="idxmap"
fi
if [ -z "${MP_AC_LAYERS:-}" ];then
MP_AC_LAYERS=1
fi
# Virtual pipeline: only emitted when MP_VP is set.
if [ -z "${MP_VP:-}" ]; then
vp_option=""
else
vp_option=" \
--num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z "${MP_SFT_PACKING:-}" ]; then
MP_SFT_PACKING=false
fi
# 1 when tensor parallelism is active (TP > 1), else 0; consulted by the
# activation-offload branch below.
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# TP comm overlap is intentionally disabled for now; keep the grad/param
# overlap flags only.
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
# Activation-checkpointing (recompute) strategy selected by $AC:
#   full    - uniform full recompute in MP_AC_LAYERS-layer chunks
#   sel     - selective recompute of attention activations only
#   none    - no recompute
#   offload - CPU activation offloading (disables grad/param-gather overlap)
# Fix: test operands are quoted so empty/odd values cannot break `[` parsing.
if [ "$AC" = full ]; then
# Layers per pipeline stage must divide evenly into recompute chunks.
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ "$_check" != 0 ]; then
echo "the num layers per pp rank must be a multiple of the recompute layers."
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ "$AC" = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
elif [ "$AC" = none ]; then
activation_checkpoint_options=" \
"
elif [ "$AC" = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ "$TP_COMM_OVERLAP" -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
--tp-comm-overlap"
else
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option=""
fi
fi
# Precision flags selected by $PR (fp16 / bf16 / fp8). fp8 still trains the
# master weights in bf16 and layers the fp8 recipe on top.
case "$PR" in
fp16)
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
;;
bf16)
pr_options=" \
--bf16"
;;
fp8)
pr_options=" \
--bf16 \
--fp8-format hybrid \
--fp8-amax-compute-algo max \
--fp8-amax-history-len 1024"
;;
esac
# Optimizer CPU offload only works with the distributed optimizer, so force
# DO on when offload was requested without it.
if [ "$OPTIMIZER_OFFLOAD" != false ] && [ "$DO" = false ]; then
echo "Offload optimizer is valid only if \$DO=true"
DO=true
fi
# Distributed-optimizer flag (empty option string when DO=false).
if [ "$DO" = true ]; then
do_option=" \
--use-distributed-optimizer"
elif [ "$DO" = false ]; then
do_option=" \
"
fi
# Sequence parallelism is only meaningful together with TP > 1.
if [ "$SP" = true ] && [ "$TP" -gt 1 ]; then
sp_option=" \
--sequence-parallel"
elif [ "$SP" = false ]; then
sp_option=" \
"
fi
# Uneven pipeline split: give the first PP stage MP_PP0_LAYERS layers and
# spread the remaining layers evenly over the other PP-1 stages.
if [ -z ${MP_PP0_LAYERS} ];then
uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
# Leftover layers must divide evenly among the remaining stages.
_check=$(( ( $NUM_LAYERS - ${MP_PP0_LAYERS} ) % ( ${PP} - 1 ) ))
if [ $_check != 0 ]; then
echo "With uneven pipelineing the left over layers must be divisible by left over stages."
exit -1
fi
uneven_split_option=" \
--decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}
"
else
echo "uneven pipeline split must be used when PP > 1"
exit -1
fi
# Load initial weights unless the checkpoint path is the literal "none".
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--load $PRETRAIN_CHECKPOINT_PATH"
fi
# When not "false", OPTIMIZER_OFFLOAD holds the fraction of optimizer state
# to keep on CPU.
if [ $OPTIMIZER_OFFLOAD != false ]; then
offload_option=" \
--optimizer-cpu-offload \
--use-precision-aware-optimizer \
--optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi
# SFT mode overrides the iteration schedule; pretrain mode derives it from
# the token budgets computed above.
if [ $SFT = true ]; then
# NOTE(review): ${25} and ${26} are positional parameters 25 and 26, which
# the launcher never passes — these look like they were meant to be the
# literals 25 / 26. Confirm before enabling SFT.
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
# Convert the token budgets into iteration counts (TRAIN_ITERS itself is
# pinned to the small value set in the model-size block above).
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
# Dataset plumbing: "raw" JSON-SFT files need the cyclic dataloader; the
# default "idxmap" path uses a pre-built mmap dataset with a 99/1/0 split.
# Fix: quote the [ ] operands so an empty MP_DATASET_TYPE / MP_SFT_PACKING
# cannot produce a "unary operator expected" error.
if [ "${MP_DATASET_TYPE}" = "raw" ]; then
dataset_options=" \
--train-data-path ${DATASET_PATH} \
--valid-data-path ${VALID_DATASET_PATH} \
--dataloader-type cyclic \
--dataset JSON-SFT"
else
dataset_options=" \
--data-path ${DATASET_PATH} \
--split 99,1,0"
fi
# MLA has no THD-format attention support, so sequence packing is always off;
# both branches intentionally produce an empty option string.
if [ "${MP_SFT_PACKING}" = true ]; then
echo "Currently MLA do not support THD format attention, thus sequence packing can not be used..."
packing_options=""
else
packing_options=""
fi
##### Prepare logdirs #######
# Run name encodes the main hyper-parameters so different configs get
# distinct tensorboard/checkpoint directories.
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
# Timestamped tensorboard dir so reruns of the same config do not collide.
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
# Tokenizer/config copying into the save dir is currently disabled.
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
# Core Megatron-LM command-line flags (optimizer schedule, model dims, logging,
# parallelism, tokenizer, MLA/rope settings). No comments may appear inside the
# quoted string — every word is splatted onto the python command line.
megatron_options=" \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--clip-grad 1.0 \
--init-method-std 0.008 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--lr-decay-iters ${LR_DECAY_ITERS} \
--lr-warmup-iters ${LR_WARMUP_ITERS} \
--train-iters ${TRAIN_ITERS} \
--micro-batch-size ${BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--log-timers-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--no-load-optim \
--no-load-rng \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
--rotary-scaling-factor ${SCALE_FACTOR} \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
"
# PyTorch-profiler flags, appended only when --profiling=torch.
# NOTE(review): "PROFIE" is a typo for "PROFILE" (used consistently below, so
# behavior is unaffected; renaming would touch multiple lines).
TORCH_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_128nodes_tp4-pp8-ep64-etp2-cp1 \
--use-pytorch-profiler \
"
# HIP-profiler flags, appended only when --profiling=hip.
HIP_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 4 \
--profile-step-end 5 \
--use-hip-profiler \
"
# Per-rank torch.distributed rendezvous settings.
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
# Assemble the full training command line; each *_option(s) variable expands
# to zero or more flags (the string is intentionally word-split at exec time).
# NOTE(review): ${sft_options} (--train-mode ...) is computed above but never
# added here — confirm whether dropping it was intentional.
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
# Optionally wrap the command with a profiler: torch appends pytorch-profiler
# flags; hip additionally runs the whole thing under hipprof.
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
# Pin each local rank to its matching GPU and NUMA node: the HIP device id,
# CPU node and memory node all equal the local rank (0-7). A single glob arm
# replaces the eight identical per-rank arms; ranks outside 0-7 run nothing,
# exactly as before.
case ${LOCAL_RANK} in
[0-7])
export HIP_VISIBLE_DEVICES=${LOCAL_RANK}
# ${APP} is intentionally unquoted: it is a whitespace-separated command line.
numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} ${APP} ;;
esac
\ No newline at end of file
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
fi
done
# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/public/home/yuguo/check/rccl-tests-0204/topo-input.xml" #"your topo file"
export GLOG_minloglevel=3
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export LD_LIBRARY_PATH=/public/home/yuguo/data/rocblas-install-0224/lib:$LD_LIBRARY_PATH
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
#export MP_PP0_LAYERS=2 # 是否使能视实际情况而定
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256
LR=1e-5
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
TP=1
PP=2
PP=1
CP=1
ETP=1
EP=4
SP=true
DO=true
......@@ -56,36 +59,36 @@ SFT=false
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
VALID_DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
PRETRAIN_CHECKPOINT_PATH=${MEGATRON_PATH}/deepseekv3_dataset #"your model path"
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=100000000
WARMUP_TOKENS=10000
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
export NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1
attn_backend_option=" \
--attention-backend fused
--attention-backend auto
"
fi
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=2
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=2
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=${SEQ_LEN}
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
......@@ -94,32 +97,43 @@ if [ $MODEL_SIZE = A37B ]; then
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=8 #256
NUM_EXPERTS=8
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 1 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size 1 \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type aux_loss \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*0+[1]*2) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
"
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 1 \
--moe-router-num-groups 1 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*1+[1]*1) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controled by env
......@@ -147,6 +161,14 @@ comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
......@@ -154,9 +176,9 @@ if [ $AC = full ]; then
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
......@@ -165,8 +187,8 @@ elif [ $AC = none ]; then
"
elif [ $AC = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
......@@ -179,8 +201,8 @@ fi
if [ $PR = fp16 ]; then
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options=" \
......@@ -200,7 +222,7 @@ fi
if [ $DO = true ]; then
do_option=" \
--use-distributed-optimizer"
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_option=" \
......@@ -210,7 +232,7 @@ fi
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_option=" \
--sequence-parallel"
--sequence-parallel"
elif [ $SP = false ]; then
sp_option=" \
......@@ -236,7 +258,7 @@ fi
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--tokenizer-model $PRETRAIN_CHECKPOINT_PATH"
--load $PRETRAIN_CHECKPOINT_PATH"
fi
if [ $OPTIMIZER_OFFLOAD != false ]; then
......@@ -247,15 +269,21 @@ if [ $OPTIMIZER_OFFLOAD != false ]; then
fi
if [ $SFT = true ]; then
TRAIN_ITERS=${24}
LR_WARMUP_ITERS=${25}
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3"
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3"
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
if [ ${MP_DATASET_TYPE} = "raw" ]; then
......@@ -278,16 +306,18 @@ else
fi
##### Prepare logdirs #######
NAME="${PREFIX}"
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options=" \
--lr ${LR} \
......@@ -314,7 +344,7 @@ megatron_options=" \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 5 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
......@@ -328,13 +358,12 @@ megatron_options=" \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-bias-swiglu-fusion \
--no-rope-fusion \
--position-embedding-type rope \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
......@@ -342,12 +371,11 @@ megatron_options=" \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
--multi-latent-attention \
--mtp-num-layers 1 \
--use-mcore-models \
"
TORCH_PROFIE_ARGS=" \
......@@ -355,7 +383,7 @@ TORCH_PROFIE_ARGS=" \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_data_16nodes_dcu \
--profile-dir torch_prof_deepseekv3_1nodes_tp1-pp1-ep4-etp1-cp1 \
--use-pytorch-profiler \
"
......@@ -367,26 +395,30 @@ HIP_PROFIE_ARGS=" \
--use-hip-profiler \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${sft_options} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${comm_overlap_option} \
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${1}:25900 \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
if [[ $profiling == "torch" ]]; then
......@@ -397,37 +429,30 @@ elif [[ $profiling == "hip" ]]; then
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
esac
0)
export HIP_VISIBLE_DEVICES=0
numactl --cpunodebind=0 --membind=0 ${APP} ;;
1)
export HIP_VISIBLE_DEVICES=1
numactl --cpunodebind=1 --membind=1 ${APP} ;;
2)
export HIP_VISIBLE_DEVICES=2
numactl --cpunodebind=2 --membind=2 ${APP} ;;
3)
export HIP_VISIBLE_DEVICES=3
numactl --cpunodebind=3 --membind=3 ${APP} ;;
4)
export HIP_VISIBLE_DEVICES=4
numactl --cpunodebind=4 --membind=4 ${APP} ;;
5)
export HIP_VISIBLE_DEVICES=5
numactl --cpunodebind=5 --membind=5 ${APP} ;;
6)
export HIP_VISIBLE_DEVICES=6
numactl --cpunodebind=6 --membind=6 ${APP} ;;
7)
export HIP_VISIBLE_DEVICES=7
numactl --cpunodebind=7 --membind=7 ${APP} ;;
esac
\ No newline at end of file
#!/bin/bash
for para in $*
do
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=2 # 是否使能视实际情况而定
### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=512
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
TP=2
PP=2
CP=1
ETP=1
EP=16
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###
### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
if [ $FL = true ]; then
:
#exit -1
elif [ $FL = false ]; then
attn_backend_option=" \
--attention-backend auto
"
fi
if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=3
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
QK_NOPE_HEAD_DIM=128
QK_ROPE_HEAD_DIM=64
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=256
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
--moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \
--moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \
--moe-router-group-topk 4 \
--moe-router-num-groups 8 \
--num-experts ${NUM_EXPERTS} \
--expert-model-parallel-size ${EP} \
--expert-tensor-parallel-size ${ETP} \
--moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
--moe-router-load-balancing-type seq_aux_loss \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-router-enable-expert-bias \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-aux-loss-coeff 0.001 \
--moe-layer-freq ([0]*1+[1]*2) \
--moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
--q-lora-rank ${Q_LORA_RANK} \
--kv-lora-rank ${KV_LORA_RANK} \
--qk-head-dim ${QK_NOPE_HEAD_DIM} \
--qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
--v-head-dim ${V_HEAD_DIM} \
--mtp-num-layers 1 \
"
mtp_options=""
fi
# Here are some configs controled by env
if [ -z ${MP_DATASET_TYPE} ];then
MP_DATASET_TYPE="idxmap"
fi
if [ -z ${MP_AC_LAYERS} ];then
MP_AC_LAYERS=1
fi
if [ -z ${MP_VP} ]; then
vp_option=""
else
vp_option=" \
--num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z ${MP_SFT_PACKING} ]; then
MP_SFT_PACKING=false
fi
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option="\
--overlap-grad-reduce \
--overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
echo "the num layers per pp rank must be a multiple of the recompute layers."
exit -1
fi
activation_checkpoint_options=" \
--recompute-method uniform \
--recompute-num-layers ${MP_AC_LAYERS} \
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options=" \
--recompute-activations"
elif [ $AC = none ]; then
activation_checkpoint_options=" \
"
elif [ $AC = offload ]; then
activation_checkpoint_options=" \
--cpu-offloading \
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="\
--tp-comm-overlap"
else
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option=""
fi
fi
# Numeric precision selected by $PR: fp16 | bf16 | fp8.
if [ $PR = fp16 ]; then
# fp16 needs QK layer scaling for numerical stability.
pr_options=" \
--fp16 \
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options=" \
--bf16"
elif [ $PR = fp8 ]; then
# fp8 GEMMs (hybrid format) on top of bf16 master precision.
pr_options=" \
--bf16 \
--fp8-format hybrid \
--fp8-amax-compute-algo max \
--fp8-amax-history-len 1024"
fi
# Optimizer CPU offload requires the distributed optimizer; force-enable it.
if [ $OPTIMIZER_OFFLOAD != false ] && [ $DO = false ]; then
echo "Offload optimizer is valid only if \$DO=true"
DO=true
fi
if [ $DO = true ]; then
do_option=" \
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_option=" \
"
fi
# Sequence parallelism only makes sense together with tensor parallelism.
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_option=" \
--sequence-parallel"
elif [ $SP = false ]; then
sp_option=" \
"
fi
# Uneven pipeline split: MP_PP0_LAYERS layers on the first PP stage, the
# remainder spread evenly over the other PP-1 stages.
if [ -z ${MP_PP0_LAYERS} ];then
uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
_check=$(( ( $NUM_LAYERS - ${MP_PP0_LAYERS} ) % ( ${PP} - 1 ) ))
if [ $_check != 0 ]; then
echo "With uneven pipelineing the left over layers must be divisible by left over stages."
exit -1
fi
uneven_split_option=" \
--decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}
"
else
echo "uneven pipeline split must be used when PP > 1"
exit -1
fi
# Resume from a pretrained checkpoint unless the path is the literal "none".
# NOTE(review): load_option/offload_option stay unset when the condition is
# false; they are expanded unquoted later, so that is harmless.
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_option=" \
--load $PRETRAIN_CHECKPOINT_PATH"
fi
# OPTIMIZER_OFFLOAD is either "false" or the fraction of optimizer state to
# keep on the CPU.
if [ $OPTIMIZER_OFFLOAD != false ]; then
offload_option=" \
--optimizer-cpu-offload \
--use-precision-aware-optimizer \
--optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi
# Finetune vs pretrain schedule. ${25} and ${26} are positional parameters of
# the enclosing script (iteration counts passed by the caller), not typos.
if [ $SFT = true ]; then
TRAIN_ITERS=${25}
LR_WARMUP_ITERS=${26}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--eod-mask-loss \
--calculate-per-token-loss \
--train-mode finetune"
else
# Pretrain: derive iteration counts from token budgets.
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_options=" \
--train-mode pretrain"
fi
# Dataset flags: "raw" = JSON-SFT files with explicit train/valid paths,
# otherwise indexed mmap dataset with a 99/1/0 split.
if [ ${MP_DATASET_TYPE} = "raw" ]; then
dataset_options=" \
--train-data-path ${DATASET_PATH} \
--valid-data-path ${VALID_DATASET_PATH} \
--dataloader-type cyclic \
--dataset JSON-SFT"
else
dataset_options=" \
--data-path ${DATASET_PATH} \
--split 99,1,0"
fi
# Sequence packing is intentionally a no-op for this model (MLA lacks THD
# attention support); both branches leave packing_options empty.
if [ ${MP_SFT_PACKING} = true ]; then
echo "Currently MLA do not support THD format attention, thus sequence packing can not be used..."
packing_options=""
else
packing_options=""
fi
##### Prepare logdirs #######
# Run name encodes the key hyper-parameters so every configuration gets its
# own tensorboard/checkpoint directory.
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
# Quote the paths: NAME embeds caller-supplied values that may contain spaces
# (the two mkdirs below were unquoted, unlike the three above).
mkdir -p "${TENSORBOARD_DIR}"
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p "${SAVED_PRETRAIN_CHECKPOINT_PATH}"
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
# Core Megatron-LM arguments: optimizer/schedule, model shape, logging,
# parallel sizes, tokenizer, normalization/rotary settings, checkpoint format.
# (Comments cannot go inside the quoted string — it is word-split later.)
megatron_options="  \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--clip-grad 1.0 \
--init-method-std 0.008 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--lr-decay-iters ${LR_DECAY_ITERS} \
--lr-warmup-iters ${LR_WARMUP_ITERS} \
--train-iters ${TRAIN_ITERS} \
--micro-batch-size ${BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--log-interval 1 \
--log-throughput \
--eval-interval 10000 \
--eval-iters 3 \
--save-interval ${SAVE_INTERVAL} \
--tensorboard-queue-size 1 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--log-timers-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--no-load-optim \
--no-load-rng \
--num-workers 8 \
--extra-vocab-size ${EXTRA_VOCAB_SIZE} \
--tokenizer-type DeepSeekV2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL_PATH} \
--swiglu \
--normalization RMSNorm \
--norm-epsilon ${RMS_NORM_EPS} \
--use-rotary-position-embeddings \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--rotary-base ${ROPE_THETA} \
--rotary-scaling-factor ${SCALE_FACTOR} \
--no-save-optim \
--kv-channels ${V_HEAD_DIM} \
--qk-layernorm \
--multi-latent-attention \
--ckpt-format torch \
--transformer-impl transformer_engine \
--no-masked-softmax-fusion \
--use-rope-scaling \
"
# PyTorch-profiler flags, appended to APP when --profiling=torch.
TORCH_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--use-pytorch-profiler \
"
# hipprof flags, appended to APP when --profiling=hip.
HIP_PROFIE_ARGS=" \
--profile \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 4 \
--profile-step-end 5 \
--use-hip-profiler \
"
# Distributed init without torchrun: explicit rank/world-size/local-rank plus
# a TCP rendezvous address.
DISTRIBUTED_ARGS=" \
--rank ${RANK} \
--world-size ${WORLD_SIZE} \
--local-rank ${LOCAL_RANK} \
--dist-url tcp://${DIST_URL}:${DIST_PORT} \
"
# Assemble the final command line by concatenating all option groups; the
# string is expanded unquoted later so it word-splits into individual flags.
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
${megatron_options} \
${dataset_options} \
${pr_options} \
${load_option} \
${activation_checkpoint_options} \
${do_option} \
${sp_option} \
${moe_options} \
${offload_option} \
${vp_option} \
${packing_options} \
${uneven_split_option} \
${attn_backend_option} \
${mtp_options} \
${comm_overlap_option} \
${DISTRIBUTED_ARGS} \
"
# Optional profiling: torch appends the PyTorch-profiler flags; hip appends
# the hip-profiler flags and wraps the command in hipprof.
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
# Hygon CPU: pin each local rank to its own GPU and the matching NUMA node.
# All eight original branches were identical up to the rank number, so a
# single [0-7] arm reuses ${LOCAL_RANK} for both the device id and the
# cpunodebind/membind targets.
case ${LOCAL_RANK} in
[0-7])
export HIP_VISIBLE_DEVICES=${LOCAL_RANK}
numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} ${APP}
;;
esac
\ No newline at end of file
# GPT3 MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a docker container run it as follows
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH="" #<Specify path>
VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \
bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH "
```
NOTE: Depending on the environment you are running it in, the above command might look slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run the 175B model. There are other configs you could run as well
### 345M
```
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--seq-length 1024 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
# Parse optional launcher flags; "$@" preserves each argument as a single
# word (the original $* re-split arguments on whitespace).
profiling=""
for para in "$@"; do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# These variables need to be filled in before launching.
GPUS=""                  # total number of GPUs to use (8 per node)
DTK_ENV=""               # env.sh of dtk
NCCL_ENV=""              # env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST=""                  # hostname of the rendezvous node
PORT=""                  # rendezvous port id
DATA_PATH=""             # path to redpajama_text_document
TOKENIZER_MODEL_PATH=""  # path to tokenizer.model
CHECKPOINT_PATH=""       # path to ckpt

# Runs the GPT 567B model across GPUS/8 nodes; bare GPUS in arithmetic avoids
# a syntax error when it is left empty.
NODES=$(( GPUS / 8 ))
mpirun -np "${GPUS}" --hostfile hostfile_gpt_567B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "
    source ${DTK_ENV} && \
    source ${NCCL_ENV} && \
    ./train_gpt_567B_${NODES}nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling" > "log-${NODES}nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
# Parse optional flags; "$@" keeps arguments intact (the original $* split
# them on whitespace).
profiling=""
for para in "$@"; do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# Single-node smoke run: 8 ranks on localhost.
# NOTE(review): the script name has no ./ prefix, so it must be on PATH —
# likely intended as ./train_gpt_567B_1nodes.sh; confirm against repo layout.
mpirun -np 8 --allow-run-as-root \
    train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait

# Clean up artifacts produced by the run.
rm -rf CKPT
rm -rf gpt_dataset/redpajama_text_document
# Parse optional flags; "$@" keeps arguments intact (the original $* split
# them on whitespace).
profiling=""
for para in "$@"; do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# 64-node run (512 ranks) rooted at node059.
# NOTE(review): the script name has no ./ prefix, so it must be on PATH —
# likely intended as ./train_gpt_567B_multinodes.sh; confirm.
mpirun -np 512 --hostfile hostfile_gpt_567B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    train_gpt_567B_multinodes.sh node059 --profiling=$profiling > output.log 2>&1
wait

# Clean up artifacts produced by the run.
rm -rf CKPT
rm -rf gpt_dataset/redpajama_text_document
\ No newline at end of file
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -23,25 +31,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -83,51 +77,33 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 98,2,0
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 1024
--global-batch-size 2048
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_64nodes_tp4-pp8-ep16-ep_tp4-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
--pipeline-model-parallel-size 16
--expert-model-parallel-size 16
--expert-tensor-parallel-size 4
--context-parallel-size 2
#--num-layers-per-virtual-pipeline-stage 2
--use-distributed-optimizer
--sequence-parallel
)
......@@ -146,10 +122,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_128nodes_tp4-pp16-ep16-etp4-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-project ${WANDB_PROJECT:-"GPT"}
--wandb-exp-name ${WANDB_NAME:-"GPT_567B"}
)
fi
......@@ -173,44 +166,28 @@ fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
0)
export HIP_VISIBLE_DEVICES=0
numactl --cpunodebind=0 --membind=0 ${APP} ;;
1)
export HIP_VISIBLE_DEVICES=1
numactl --cpunodebind=1 --membind=1 ${APP} ;;
2)
export HIP_VISIBLE_DEVICES=2
numactl --cpunodebind=2 --membind=2 ${APP} ;;
3)
export HIP_VISIBLE_DEVICES=3
numactl --cpunodebind=3 --membind=3 ${APP} ;;
4)
export HIP_VISIBLE_DEVICES=4
numactl --cpunodebind=4 --membind=4 ${APP} ;;
5)
export HIP_VISIBLE_DEVICES=5
numactl --cpunodebind=5 --membind=5 ${APP} ;;
6)
export HIP_VISIBLE_DEVICES=6
numactl --cpunodebind=6 --membind=6 ${APP} ;;
7)
export HIP_VISIBLE_DEVICES=7
numactl --cpunodebind=7 --membind=7 ${APP} ;;
esac
\ No newline at end of file
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -23,25 +31,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -83,8 +77,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 98,2,0
)
......@@ -93,39 +87,23 @@ TRAINING_ARGS=(
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep4-ep_tp2-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
--expert-model-parallel-size 4
--expert-tensor-parallel-size 2
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
......@@ -144,10 +122,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep4-etp2-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-project ${WANDB_PROJECT:-"GPT"}
--wandb-exp-name ${WANDB_NAME:-"GPT_567B"}
)
fi
......@@ -171,44 +166,28 @@ fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
0)
export HIP_VISIBLE_DEVICES=0
numactl --cpunodebind=0 --membind=0 ${APP} ;;
1)
export HIP_VISIBLE_DEVICES=1
numactl --cpunodebind=1 --membind=1 ${APP} ;;
2)
export HIP_VISIBLE_DEVICES=2
numactl --cpunodebind=2 --membind=2 ${APP} ;;
3)
export HIP_VISIBLE_DEVICES=3
numactl --cpunodebind=3 --membind=3 ${APP} ;;
4)
export HIP_VISIBLE_DEVICES=4
numactl --cpunodebind=4 --membind=4 ${APP} ;;
5)
export HIP_VISIBLE_DEVICES=5
numactl --cpunodebind=5 --membind=5 ${APP} ;;
6)
export HIP_VISIBLE_DEVICES=6
numactl --cpunodebind=6 --membind=6 ${APP} ;;
7)
export HIP_VISIBLE_DEVICES=7
numactl --cpunodebind=7 --membind=7 ${APP} ;;
esac
\ No newline at end of file
#!/bin/bash
# set -eux
# Parse optional flags; "$@" preserves argument boundaries (the original $*
# re-split every argument on whitespace).
profiling=""
for para in "$@"; do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done
# Resolve the Megatron checkout two directories above this script.
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
#default env
#export FLASH_ATTENTION_PRINT_PARAM=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
# export GPU_MAX_HW_QUEUES=10
#export HIP_ALLOC_INITIALIZE=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
# nccl env
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only log error-level NCCL/glog messages
source /opt/dtk/env.sh
# hipblaslt library
export LD_LIBRARY_PATH=/data/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# rocblas
export LD_LIBRARY_PATH=/data/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
# torch: fold multi-stream communication onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# enlarge the torch compile cache
export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/data/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
# Llama2-7B architecture.
GPT_MODEL_ARGS=(
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--max-position-embeddings 4096
--normalization RMSNorm # LightopRMSNorm
--position-embedding-type rope # none #
--untie-embeddings-and-output-weights # separate embedding and output weights for flexibility
)
export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention path
# --transformer-impl transformer_engine # use this pair of options for the mcore path
# --use-mcore-models
# --transformer-impl local # use this pair of options for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl local # use this pair of options for the legacy path
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256 #256 #240 #60 #512 #64
--train-iters 50
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # fp16 requires an explicit loss scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # average grads directly in the DP collective instead of sum-then-divide
# --recompute-granularity full # recompute lowers memory at the cost of time
# --recompute-num-layers 5 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap DDP grad reduce with compute
# --tp-comm-overlap # overlap TP comm with GEMM; optimization not yet adapted
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
--use-flash-attn
)
# Env vars for the torch flash-attention path (disabled).
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
# --context-parallel-size 2
# --num-layers-per-virtual-pipeline-stage 4
# --microbatch-group-size-per-virtual-pipeline-stage 1
# --no-overlap-p2p-communication # disable p2p comm overlap when enabled
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-throughput
--eval-iters 50
--log-interval 1
--save-interval 1000
--eval-interval 1000
--save $SAVE_PATH
--load $SAVE_PATH
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
# PyTorch-profiler flags (opt-in; see commented PROFILE_ARGS below APP).
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7
--profile-dir prof_data
)
# Ranks and rendezvous come from Open MPI's environment.
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
# Final command; arrays expand into individual flags.
APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# To enable profiling, append ${PROFILE_ARGS[@]} to APP above.
# export HIP_VISIBLE_DEVICES=0,7 # subset for debugging
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# export CUDA_VISIBLE_DEVICES=4,5,6,7
# Bind each local rank to its NUMA node. Every original branch exported the
# full GPU list and differed only in the NUMA id, so one [0-7] arm suffices.
# (hipprof wrapper variants kept for reference:
#  hipprof --hip-trace --trace-off numactl ... ${APP})
case ${LOCAL_RANK} in
[0-7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} ${APP}
;;
esac
\ No newline at end of file
# Parse optional launcher flags; "$@" preserves each argument as one word
# (the original $* re-split arguments on whitespace).
profiling=""
for para in "$@"; do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# These variables need to be filled in before launching.
GPUS=""                  # total number of GPUs to use (8 per node)
DTK_ENV=""               # env.sh of dtk
NCCL_ENV=""              # env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST=""                  # hostname of the rendezvous node
PORT=""                  # rendezvous port id
DATA_PATH=""             # path to oscar-1GB_head-llama2_text_document
TOKENIZER_MODEL_PATH=""  # path to tokenizer.model
CHECKPOINT_PATH=""       # path to ckpt

# Runs the Llama2 7B model across GPUS/8 nodes; bare GPUS in arithmetic
# avoids a syntax error when it is left empty.
NODES=$(( GPUS / 8 ))
mpirun -np "${GPUS}" --hostfile hostfile_llama2_7B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "
    source ${DTK_ENV} && \
    source ${NCCL_ENV} && \
    ./train_llama2_7b_${NODES}nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=$DATA_PATH \
    --tokenizer_path=$TOKENIZER_MODEL_PATH \
    --checkpoint_path=$CHECKPOINT_PATH \
    --profiling=$profiling" > "log-${NODES}nodes-$(date +%F-%H%M).log" 2>&1
wait
\ No newline at end of file
#!/bin/bash
# Default dataloader settings; overridden by --reproduce below.
INITIALIZATION_ARGS=( --num-workers 2)
# Parse launcher flags; "$@" preserves argument boundaries (the original $*
# re-split every argument on whitespace).
for para in "$@"; do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        # Deterministic mode: single-process dataloading plus deterministic
        # library behavior.
        INITIALIZATION_ARGS=( --reproduce --num-workers 0)
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1 # force deterministic MIOpen algorithms
        export ROCBLAS_ATOMICS_MOD=0                    # disable rocBLAS atomic operations
        # Disable the atomic-using convolution algorithms in MIOpen and keep
        # only GEMM.
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env — rank info and rendezvous come from mpirun / positional args.
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
# Megatron checkout is two directories above this script.
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# torch: fold multi-stream communication onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# enlarge the torch compile cache
export cache_size_limit=64
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
# Llama2-7B architecture.
GPT_MODEL_ARGS=(
--seq-length 4096
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--max-position-embeddings 4096
--normalization LightopRMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights
)
# Optimizer/schedule and runtime features (legacy model path, bf16, flash attn).
TRAINING_ARGS=(
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256
--train-iters 50
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective
--overlap-grad-reduce
--use-flash-attn
)
# TP1/PP2 with distributed optimizer and sequence parallelism.
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-throughput
--eval-iters 5
--log-interval 1
--save-interval 1000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)
# PyTorch-profiler flags (appended when --profiling=torch).
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
--use-pytorch-profiler
)
# hipprof flags (appended when --profiling=hip).
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
# Final command; arrays expand into individual flags when APP is word-split.
APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
${INITIALIZATION_ARGS[@]} \
"
# Optional profiling: torch appends the PyTorch-profiler flags; hip appends
# the hip-profiler flags and wraps the command in hipprof.
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
# Hygon CPU: pin each local rank to its own GPU and the matching NUMA node.
# The eight original branches were identical up to the rank number, so a
# single [0-7] arm reuses ${LOCAL_RANK} for the device id and NUMA binding.
case ${LOCAL_RANK} in
[0-7])
export HIP_VISIBLE_DEVICES=${LOCAL_RANK}
numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} ${APP}
;;
esac
\ No newline at end of file
# Mixtral 8x7B Model Inference and Finetuning
## Download Mixtral 8x7B Checkpoints
Download Mixtral 8x7B HF format checkpoint from [HF-hub](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/)
Or you can simply run this following script to download Mixtral 8x7B into a specific folder.
```python
from huggingface_hub import snapshot_download
SAVED_DIR = "" # Specify the saved directory
# Download HF checkpoints
snapshot_download(repo_id="mistralai/Mixtral-8x7B-v0.1", ignore_patterns=["*.pt"], local_dir=SAVED_DIR, local_dir_use_symlinks=False)
```
## Convert Mixtral 8x7B checkpoints from HF to MCore
The HF checkpoints can be converted to Megatron format by using the provided checkpoint converter for HF format.
The target model parallel size(e.g. TP,PP,EP) should be specified.
Currently the converter doesn't support distributed checkpointing yet, so each different parallel config requires a specific checkpoint.
- For training, the recommended model parallel config is TP1EP8PP4
- For inference, the recommended model parallel config is TP1EP1PP2
```
TOKENIZER_MODEL=/workspace/checkpoints/mixtral-hf/tokenizer.model
MEGATRON_PATH="/workspace/megatron-lm"
export PYTHONPATH=$MEGATRON_PATH:$PYTHONPATH
export CUDA_DEVICE_MAX_CONNECTIONS=1
TARGET_TP_SIZE=""
TARGET_EP_SIZE=""
TARGET_PP_SIZE=""
HF_FORMAT_DIR=/workspace/checkpoints/mixtral-hf
MEGATRON_FORMAT_DIR=/workspace/checkpoints/mixtral-mcore-TP${TARGET_TP_SIZE}PP${TARGET_PP_SIZE}EP${TARGET_EP_SIZE}
python tools/checkpoint/convert.py \
--model-type GPT \
--loader loader_mixtral_hf \
--saver mcore \
--target-tensor-parallel-size ${TARGET_TP_SIZE} \
--target-pipeline-parallel-size ${TARGET_PP_SIZE} \
--target-expert-parallel-size ${TARGET_EP_SIZE} \
--load-dir ${HF_FORMAT_DIR} \
--save-dir ${MEGATRON_FORMAT_DIR} \
--tokenizer-model ${TOKENIZER_MODEL}
```
## Text generation with Mixtral 8x7B
Inference with Mixtral 8x7B requires at least 2 GPUs, so a distributed checkpoint with EP>=2 or PP>=2 (converted with the script above) is needed.
Megatron-LM includes a simple REST server for text generation in `tools/run_text_generation_server.py`; launch it with the following script:
```
#!/bin/bash
# This example will start serving the Mixtral 8x7B model.
DISTRIBUTED_ARGS="--nproc_per_node 2 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint>
TOKENIZER_MODEL=<Path to tokenizer (e.g. /tokenizer.model)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 2 \
--expert-model-parallel-size 1 \
--load ${CHECKPOINT} \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model $TOKENIZER_MODEL \
--use-mcore-models \
--max-position-embeddings 32768 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--normalization RMSNorm \
--disable-bias-linear \
--position-embedding-type rope \
--no-position-embedding \
--swiglu \
--untie-embeddings-and-output-weights \
--group-query-attention \
--num-query-groups 8 \
--bf16 \
--micro-batch-size 1 \
--seq-length 1024 \
--seed 42 \
--num-experts 8 \
--moe-router-topk 2 \
--moe-token-dispatcher-type alltoall \
--moe-grouped-gemm \
--mock-data \
--rotary-base 1000000
```
Once the server is running, you can use `tools/text_generation_cli.py` to query it; it takes a single argument: the host the server is running on.
```
python tools/text_generation_cli.py localhost:5000
```
## Finetuning from pretrained Mixtral 8x7B
To finetune the pretrained Mixtral 8x7B model, use the following scripts:
```bash
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.04-py3
CHECKPOINT_PATH="" # Specify path to checkpoint dir
TOKENIZER_MODEL="" # Specify path to tokenizer.model
DATA_PATH="" # Specify path to data
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/mixtral/train_mixtral_8x7b_distributed.sh $CHECKPOINT_PATH $TOKENIZER_MODEL $DATA_PATH
```
The above functionality also applies to Mixtral 8x22B; set the model config (including hidden_size/head_num/num_layers/ffn_hidden_size) according to the original [config](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1/blob/main/config.json).
## Acknowledgements
Contributors outside NVIDIA for the huggingface converter and example of Mixtral models in Megatron-Core:
- Peng Li <jerry.lp@alibaba-inc.com>
- Jun Huang <huangjun.hj@alibaba-inc.com>
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment