set -eux

# Multi-node environment variables
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5
source /opt/dtk/env.sh

# TE's GEMM calls need the hipBLASLt library on LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
#export HIP_ALLOC_INITIALIZE=0
#export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=8
export NCCL_MIN_NCHANNELS=20
export NCCL_MIN_P2P_NCHANNELS=8
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_1,mlx5_2
#export NCCL_SOCKET_IFNAME=ibs8
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
#export NCCL_DEBUG=info

# Offline mode (HuggingFace)
# export HF_DATASETS_OFFLINE=1
# export HF_HUB_OFFLINE=1

# Add synchronization when profiling
# export GPU_FLUSH_ON_EXECUTION=1

# # Workaround for multi-node stalls
# export HIP_DIRECT_DISPATCH=0

# # torchrun parameters
# NNODES=1
# NODE_RANK=0
# NUM_GPUS=8
# MASTER_ADDR="172.16.1.76"
# MASTER_PORT=29500

# # Model size
# MODEL_SIZE=7

# # Dataset
# DATASET="[1.0,/data/nemo_dataset/oscar-1GB-llama/oscar-1GB-llama_text_document]"

# # Hyperparameters
# MICRO_BATCH_SIZE=1
# GLOBAL_BATCH_SIZE=16
# TRAIN_STEPS=250000
# LR=3e-4
# MIN_LR=3e-5
# LR_WARMUP_STEPS=2000
# DROP_OUT=0.0
# WEIGHT_DECAY=0.1
# GRAD_CLIP=1
# MAX_SEQ_LEN=4096
# MAX_POSITION_EMBEDDINGS=4096

# # Tensor-parallel and pipeline-parallel sizes
# TP=4
# PP=1
# SP=False

# # Parse command-line arguments
# while [ $# -gt 0 ]
# do
#     case $1 in
#         -M|--MODEL_SIZE)
#             MODEL_SIZE=$2; shift;;
#         --TP)
#             TP=$2; shift;;
#         --PP)
#             PP=$2; shift;;
#         --SP)
#             SP=$2; shift;;
#         --peft)
#             peft_scheme=$2; shift;;
#         --global_batch)
#             global_batch=$2; shift;;
#         --NNODES)
#             NNODES=$2; shift;;
#         --NODE_RANK)
#             NODE_RANK=$2; shift;;
#         --NUM_GPUS)
#             NUM_GPUS=$2; shift;;
#         --MASTER_ADDR)
#             MASTER_ADDR=$2; shift;;
#         --MASTER_PORT)
#             MASTER_PORT=$2; shift;;
#         *)
#             echo "invalid parameter: $1"
#             exit 1;;
#     esac
#     shift
# done

# # Select the model configuration
# if [[ ${MODEL_SIZE} == 7 ]]; then HIDDEN_SIZE=4096; NUM_HEADS=32; NUM_QUERY_GROUP=32; NUM_LAYERS=32; FFN_HIDDEN_SIZE=11008; NORM_EPS=1e-5;
# elif [[ ${MODEL_SIZE} == 13 ]]; then HIDDEN_SIZE=5120; NUM_HEADS=40; NUM_QUERY_GROUP=40; NUM_LAYERS=40; FFN_HIDDEN_SIZE=13824; NORM_EPS=1e-5;
# elif [[ ${MODEL_SIZE} == 70 ]]; then HIDDEN_SIZE=8192; NUM_HEADS=64; NUM_QUERY_GROUP=8; NUM_LAYERS=80; FFN_HIDDEN_SIZE=28672; NORM_EPS=1e-5;
# elif [[ ${MODEL_SIZE} == "tiny" ]]; then HIDDEN_SIZE=128; NUM_HEADS=4; NUM_QUERY_GROUP=4; NUM_LAYERS=4; FFN_HIDDEN_SIZE=512; NORM_EPS=1e-5;
# else echo "invalid MODEL_SIZE: ${MODEL_SIZE}"; exit 1
# fi

# Launch training (multi-node torchrun variant, currently disabled)
# torchrun --nnodes $NNODES --node_rank $NODE_RANK --nproc_per_node $NUM_GPUS \
#     --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
#     /workspace/NeMo-2.0.0.rc0.beta/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
#     --config-path=conf/ \
#     --config-name=megatron_gpt_config \
#     trainer.devices=${NUM_GPUS} \
#     trainer.num_nodes=${NNODES} \
#     trainer.max_epochs=null \
#     trainer.max_steps=300000 \
#     trainer.val_check_interval=300 \
#     trainer.log_every_n_steps=50 \
#     trainer.limit_val_batches=50 \
#     trainer.limit_test_batches=50 \
#     trainer.accumulate_grad_batches=1 \
#     trainer.precision=16 \
#     model.micro_batch_size=${MICRO_BATCH_SIZE} \
#     model.global_batch_size=${GLOBAL_BATCH_SIZE} \
#     model.tensor_model_parallel_size=${TP} \
#     model.pipeline_model_parallel_size=${PP} \
#     model.max_position_embeddings=${MAX_POSITION_EMBEDDINGS} \
#     model.encoder_seq_length=${MAX_POSITION_EMBEDDINGS} \
#     model.hidden_size=${HIDDEN_SIZE} \
#     model.ffn_hidden_size=${FFN_HIDDEN_SIZE} \
#     model.num_layers=${NUM_LAYERS} \
#     model.num_attention_heads=${NUM_HEADS} \
#     model.init_method_std=0.021 \
#     model.hidden_dropout=${DROP_OUT} \
#     model.layernorm_epsilon=${NORM_EPS} \
#     model.data.data_prefix=${DATASET} \
#     model.data.num_workers=2 \
#     model.data.seq_length=${MAX_SEQ_LEN} \
#     model.data.splits_string=\'949,50,1\' \
#     model.optim.name=fused_adam \
#     model.optim.lr=${LR} \
#     model.optim.betas=[0.9,0.95] \
#     model.optim.weight_decay=${WEIGHT_DECAY} \
#     model.optim.sched.name=CosineAnnealing \
#     model.optim.sched.warmup_steps=750 \
#     model.optim.sched.constant_steps=80000 \
#     model.optim.sched.min_lr=${MIN_LR} \
#     model.tokenizer.type=Llama2Tokenizer \
#     model.tokenizer.model=/data/Megatron_LM/llama/tokenizer.model \
#     model.num_query_groups=${NUM_QUERY_GROUP} \
#     model.position_embedding_type=rope \
#     model.normalization=rmsnorm

# GPT-2 BPE tokenizer alternative (unused)
#     model.tokenizer.vocab_file=gpt2-vocab.json \
#     model.tokenizer.merge_file=gpt2-merges.txt \

# TOKENIZER_TYPE=Llama2Tokenizer
# TOKENIZER_MODEL=/data/Megatron_LM/llama/tokenizer.model

DATASET="[1.0,/data/nemo_dataset/oscar-1GB-llama/oscar-1GB-llama_text_document]"

export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# export NVTE_FLASH_ATTN=1        # use the CUTLASS flash-attention path
export NVTE_FLASH_ATTN_TRITON=1   # use the Triton flash-attention path (triton_fa)

# Launch training (single-node, 8 devices)
python ./megatron_gpt_pretraining.py \
    --config-path=conf/ \
    --config-name=megatron_gpt_config \
    trainer.devices=8 \
    trainer.num_nodes=1 \
    trainer.precision=bf16 \
    model.micro_batch_size=1 \
    model.global_batch_size=60 \
    model.tensor_model_parallel_size=2 \
    model.pipeline_model_parallel_size=2 \
    model.sequence_parallel=True \
    model.encoder_seq_length=4096 \
    model.num_layers=32 \
    model.hidden_size=4096 \
    model.ffn_hidden_size=11008 \
    model.num_attention_heads=32 \
    model.max_position_embeddings=4096 \
    model.num_query_groups=null \
    model.mcore_gpt=False \
    model.transformer_engine=False \
    model.fp8=False \
    model.ub_tp_comm_overlap=False \
    model.use_flash_attention=True \
    model.data.seq_length=4096

# model.mcore_gpt=True \
# model.transformer_engine=True \
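
# Sanity-check sketch for the single-node launch above, assuming the usual
# Megatron-style batching (global_batch = micro_batch * data_parallel * grad_accum
# and data_parallel = devices / (TP * PP)). The shell variables below are
# illustrative only and are not read by the NeMo config.
DEVICES=8; TP=2; PP=2; MBS=1; GBS=60
DP=$(( DEVICES / (TP * PP) ))    # 8 / (2 * 2) = 2 data-parallel replicas
ACC=$(( GBS / (MBS * DP) ))      # 60 / (1 * 2) = 30 gradient-accumulation steps
echo "data-parallel size: ${DP}, gradient-accumulation steps: ${ACC}"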