#!/bin/bash
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "72B" parameter Qwen model (see the size check below GPT_MODEL_ARGS)
export HSA_FORCE_FINE_GRAIN_PCIE=1   # ROCm: fine-grained PCIe memory
export OMP_NUM_THREADS=1             # one OpenMP thread per rank
export NCCL_P2P_LEVEL=5
#export HIP_ALLOC_INITIALIZE=0
#export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring                # force the Ring collective algorithm
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export NCCL_IB_TIMEOUT=22            # larger InfiniBand timeout for long jobs
export CUDA_DEVICE_MAX_CONNECTIONS=1 # required by Megatron-LM when sequence parallelism is enabled
export NCCL_IB_HCA=mlx5_1,mlx5_2     # restrict NCCL to these IB adapters
#export NCCL_SOCKET_IFNAME=ibs8
export NCCL_NET_GDR_LEVEL=SYS        # allow GPUDirect RDMA up to system scope
export NCCL_NET_GDR_READ=0           # disable GPUDirect RDMA on the send/read path
#export NCCL_DEBUG=info              # uncomment for NCCL debug logging
source /opt/dtk/env.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK # local rank of this process on its node
RANK=$OMPI_COMM_WORLD_RANK        # global rank across all nodes
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE  # total number of ranks
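# This script is meant to be launched once per process by an MPI launcher, so the
# OMPI_COMM_WORLD_* variables above are populated. A hypothetical invocation for
# 8 nodes x 8 ranks (hostfile, script name, and master IP passed as $1 are
# assumptions, not taken from this repo):
#
#   mpirun -np 64 --map-by ppr:8:node --hostfile hostfile ./train_qwen.sh 10.0.0.1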
CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
DATA_PATH="/mnt/fs/user/llama/dcu/zhaoying_qwen/Megatron-LM-main/qwen_token/my-qwen_text_document" #<Specify path and file prefix>_text_document
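# DATA_PATH is the prefix of an indexed Megatron dataset; preprocessing
# (typically tools/preprocess_data.py) produces a matching pair of files:
#   my-qwen_text_document.bin  and  my-qwen_text_document.idx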
#TOKENIZER_PATH="/mnt/fs/user/llama/dcu/megatron-llama/tokenizer.model"
GPT_MODEL_ARGS=(
--num-layers 80
--hidden-size 8192
--ffn-hidden-size 29568
--num-attention-heads 64
--seq-length 4096
--max-position-embeddings 4096 #32768
--num-query-groups 8
--group-query-attention
)
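# Rough size check for the arguments above (back-of-the-envelope estimates):
#   head dim             = 8192 / 64 heads = 128
#   GQA                  = 64 query heads sharing 8 KV groups (8 heads per group)
#   per-layer attention  ~ Q/out (2 x 8192^2) + K/V (2 x 8192 x 8 x 128) ~ 0.15B
#   per-layer SwiGLU MLP ~ 3 x 8192 x 29568                              ~ 0.73B
#   80 layers plus untied embeddings put the total around 72B parameters,
#   which is why the header comment says 72B rather than 7B.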
TRAINING_ARGS=(
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 64
--train-iters 100
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--use-distributed-optimizer
--use-flash-attn-v2
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
--no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
)
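# With the settings above, Megatron's cosine schedule roughly follows
#   lr(t) = min_lr + 0.5 * (lr - min_lr) * (1 + cos(pi * t / train_iters))
# after the 1-iteration warmup, decaying from 3.0e-5 down to 3.0e-6.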
#--ddp-average-in-collective
#--overlap-grad-reduce
#--recompute-granularity full
#--recompute-activations
#--use-flash-attn
#--sequence-parallel
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
)
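# Sanity check on the parallel layout (assuming 8 nodes x 8 GPUs = 64 ranks):
#   data-parallel size = 64 / (TP 4 * PP 8) = 2
#   grad accum steps   = global_batch 64 / (micro_batch 1 * DP 2) = 32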
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1 # train/validation/test split weights
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-type QwenTokenizer
--merge-file /mnt/fs/user/llama/dcu/zhaoying_qwen/Megatron-LM-main/qwen_token/merges.txt
--vocab-file /mnt/fs/user/llama/dcu/zhaoying_qwen/Megatron-LM-main/qwen_token/vocab.json
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
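# --save and --load point at the same directory, so a restarted job resumes
# from the most recent checkpoint there (if any) instead of starting from scratch.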
APP="python3 -u pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url tcp://${1}:34566 \
"
case ${lrank} in
[0-7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # every local rank sees all 8 GPUs
${APP}
;;
esac