#!/bin/bash
# Example invocation:
# sh run_pretrain_megatron_llama.sh dsw /workspace/Pai-Megatron-Patch 7B 1 8 1e-5 1e-6 2048 2048 0 bf16 1 1 sel true true true false 100000 /mnt/llama2-datasets/wudao_llamabpe_text_document /mnt/llama2-ckpts/Llama-2-7b-hf-to-mg-tp1-pp1/ 10000000000 100000000 /mnt/output_patch_test
set -e

ENV=$1                              # dsw (single node) or dlc (multi-node cluster job)
MEGATRON_PATCH_PATH=$2
MEGATRON_PATH=${MEGATRON_PATCH_PATH}/Megatron-LM-231007
export PYTHONPATH=${MEGATRON_PATH}:${MEGATRON_PATCH_PATH}:$PYTHONPATH
export CUDA_DEVICE_MAX_CONNECTIONS=1

if [ $ENV = dsw ]; then
    # Single-node run: use all 8 local GPUs and pick a random rendezvous port.
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    MASTER_ADDR=localhost
    MASTER_PORT=$(shuf -n 1 -i 10000-65535)
    NNODES=1
    NODE_RANK=0
    GPUS_PER_NODE=8
elif [ $ENV = dlc ]; then
    # Multi-node run: topology comes from environment variables injected by the cluster scheduler.
    NNODES=${WORLD_SIZE}
    NODE_RANK=${RANK}
    GPUS_PER_NODE=${KUBERNETES_CONTAINER_RESOURCE_GPU}
fi

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

# Positional arguments.
MODEL_SIZE=$3                       # 7B, 13B or 34B
BATCH_SIZE=$4                       # micro batch size
GLOBAL_BATCH_SIZE=$5
LR=$6
MIN_LR=$7
SEQ_LEN=$8
PAD_LEN=$9
EXTRA_VOCAB_SIZE=${10}
PR=${11}                            # precision: fp16, bf16 or fp8
TP=${12}                            # tensor model parallel size
PP=${13}                            # pipeline model parallel size
AC=${14}                            # activation checkpointing: full, sel or none
DO=${15}                            # distributed optimizer: true or false
FL=${16}                            # flash attention: true or false
SP=${17}                            # sequence parallelism: true or false
TE=${18}                            # Transformer Engine implementation: true or false
SAVE_INTERVAL=${19}
DATASET_PATH=${20}
PRETRAIN_CHECKPOINT_PATH=${21}
TRAIN_TOKENS=${22}
WARMUP_TOKENS=${23}
OUTPUT_BASEPATH=${24}

# Model architecture presets.
if [ $MODEL_SIZE = 7B ]; then
    NUM_LAYERS=32
    HIDDEN_SIZE=4096
    NUM_ATTN_HEADS=32
    INTERMEDIATE_SIZE=11008
    gqa_options=""
elif [ $MODEL_SIZE = 13B ]; then
    NUM_LAYERS=40
    HIDDEN_SIZE=5120
    NUM_ATTN_HEADS=40
    INTERMEDIATE_SIZE=13824
    gqa_options=""
elif [ $MODEL_SIZE = 34B ]; then
    NUM_LAYERS=48
    HIDDEN_SIZE=8192
    NUM_ATTN_HEADS=64
    INTERMEDIATE_SIZE=22016
    gqa_options=" \
        --group-query-attention \
        --num-query-groups 8"
fi

if [ $AC = full ]; then
    activation_checkpoint_options=" \
        --recompute-method uniform \
        --recompute-granularity full"
elif [ $AC = sel ]; then
    activation_checkpoint_options=" \
        --recompute-activations"
elif [ $AC = none ]; then
    activation_checkpoint_options=" \
        "
fi

if [ $PR = fp16 ]; then
    pr_options=" \
        --fp16"
elif [ $PR = bf16 ]; then
    pr_options=" \
        --bf16"
elif [ $PR = fp8 ]; then
    pr_options=" \
        --bf16 \
        --fp8-hybrid \
        --fp8-amax-compute-algo max \
        --fp8-amax-history-len 1024 \
        --transformer-impl transformer_engine"
fi

if [ $DO = true ]; then
    do_options=" \
        --use-distributed-optimizer"
elif [ $DO = false ]; then
    do_options=" \
        "
fi

if [ $FL = true ]; then
    flash_options=" \
        --use-flash-attn"
elif [ $FL = false ]; then
    flash_options=" \
        "
fi

if [ $TE = true ]; then
    te_options=" \
        --transformer-impl transformer_engine"
elif [ $TE = false ]; then
    te_options=" \
        "
fi

# Sequence parallelism only applies when tensor parallelism is enabled (TP > 1).
if [ $SP = true ] && [ $TP -gt 1 ]; then
    sp_options=" \
        --sequence-parallel"
elif [ $SP = false ]; then
    sp_options=" \
        "
fi

if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
    load_options=" \
        --load $PRETRAIN_CHECKPOINT_PATH"
fi

# Convert token budgets into iteration counts (see the worked example at the end of this script).
TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))

NAME="${ENV}-pretrain-megatron-llama-${MODEL_SIZE}-lr-${LR}-bs-${BATCH_SIZE}-seqlen-${SEQ_LEN}-pr-${PR}-tp-${TP}-pp-${PP}-ac-${AC}-do-${DO}-sp-${SP}-tt-${TRAIN_TOKENS}-wt-${WARMUP_TOKENS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p "${TENSORBOARD_DIR}"

SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"

megatron_options=" \
        --save ${SAVED_PRETRAIN_CHECKPOINT_PATH} \
        --split 98,2,0 \
        --train-data-path ${DATASET_PATH} \
        --lr ${LR} \
        --min-lr ${MIN_LR} \
        --lr-decay-style linear \
        --adam-beta1 0.9 \
        --adam-beta2 0.95 \
        --weight-decay 0.1 \
        --clip-grad 1.0 \
        --init-method-std 0.006 \
        --lr-decay-iters ${LR_DECAY_ITERS} \
        --lr-warmup-iters ${LR_WARMUP_ITERS} \
        --train-iters ${TRAIN_ITERS} \
        --micro-batch-size ${BATCH_SIZE} \
        --global-batch-size ${GLOBAL_BATCH_SIZE} \
        --num-layers ${NUM_LAYERS} \
        --hidden-size ${HIDDEN_SIZE} \
        --num-attention-heads ${NUM_ATTN_HEADS} \
        --ffn-hidden-size ${INTERMEDIATE_SIZE} \
        --seq-length ${SEQ_LEN} \
        --max-position-embeddings ${SEQ_LEN} \
        --log-interval 1 \
        --eval-interval 10000 \
        --eval-iters 10 \
        --save-interval ${SAVE_INTERVAL} \
        --tensorboard-queue-size 1 \
        --tensorboard-dir ${TENSORBOARD_DIR} \
        --log-timers-to-tensorboard \
        --log-batch-size-to-tensorboard \
        --log-validation-ppl-to-tensorboard \
        --tensor-model-parallel-size ${TP} \
        --pipeline-model-parallel-size ${PP} \
        --dataset LLama-Pretrain-Idxmap \
        --no-load-optim \
        --no-load-rng \
        --num-workers 8 \
        --seed 1234 \
        --max-padding-length ${PAD_LEN} \
        --extra-vocab-size ${EXTRA_VOCAB_SIZE} \
        --patch-tokenizer-type LLamaTokenizer \
        --swiglu \
        --normalization RMSNorm \
        --use-llama2-rotary-position-embeddings \
        --position-embedding-type rope \
        --untie-embeddings-and-output-weights \
        --rotary-base 1000000 \
        --disable-bias-linear"

run_cmd="torchrun $DISTRIBUTED_ARGS pretrain_megatron_llama.py ${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options} ${gqa_options}"

echo ${run_cmd}
eval ${run_cmd}
set +x
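
# For reference, a worked example of the token-to-iteration arithmetic above. This note is an
# added illustration, not part of the launch logic; the numbers are taken from the sample
# invocation in the header (GLOBAL_BATCH_SIZE=8, SEQ_LEN=2048, TRAIN_TOKENS=10000000000,
# WARMUP_TOKENS=100000000):
#   tokens per iteration = GLOBAL_BATCH_SIZE * SEQ_LEN = 8 * 2048 = 16384
#   TRAIN_ITERS     = 10000000000 / 8 / 2048 = 610351   (bash integer division)
#   LR_WARMUP_ITERS = 100000000   / 8 / 2048 = 6103
#   LR_DECAY_ITERS  = TRAIN_ITERS             = 610351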