#!/bin/bash
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "72B" parameter Qwen model (see the size check below GPT_MODEL_ARGS)
export HSA_FORCE_FINE_GRAIN_PCIE=1   # ROCm: fine-grained PCIe memory
export OMP_NUM_THREADS=1             # one OpenMP thread per rank
export NCCL_P2P_LEVEL=5
#export HIP_ALLOC_INITIALIZE=0
#export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring                # force the Ring collective algorithm
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export NCCL_IB_TIMEOUT=22            # larger InfiniBand timeout for long jobs
export CUDA_DEVICE_MAX_CONNECTIONS=1 # required by Megatron-LM when sequence parallelism is enabled
export NCCL_IB_HCA=mlx5_1,mlx5_2     # restrict NCCL to these IB adapters
#export NCCL_SOCKET_IFNAME=ibs8
export NCCL_NET_GDR_LEVEL=SYS        # allow GPUDirect RDMA up to system scope
export NCCL_NET_GDR_READ=0           # disable GPUDirect RDMA on the send/read path
#export NCCL_DEBUG=info              # uncomment for NCCL debug logging
source /opt/dtk/env.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK # local rank of this process on its node
RANK=$OMPI_COMM_WORLD_RANK        # global rank across all nodes
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE  # total number of ranks
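# This script is meant to be launched once per process by an MPI launcher, so the
# OMPI_COMM_WORLD_* variables above are populated. A hypothetical invocation for
# 8 nodes x 8 ranks (hostfile, script name, and master IP passed as $1 are
# assumptions, not taken from this repo):
#
#   mpirun -np 64 --map-by ppr:8:node --hostfile hostfile ./train_qwen.sh 10.0.0.1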
CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
DATA_PATH="/mnt/fs/user/llama/dcu/zhaoying_qwen/Megatron-LM-main/qwen_token/my-qwen_text_document" #<Specify path and file prefix>_text_document
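# DATA_PATH is the prefix of an indexed Megatron dataset; preprocessing
# (typically tools/preprocess_data.py) produces a matching pair of files:
#   my-qwen_text_document.bin  and  my-qwen_text_document.idx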
#TOKENIZER_PATH="/mnt/fs/user/llama/dcu/megatron-llama/tokenizer.model"
GPT_MODEL_ARGS=(
--num-layers 80
--hidden-size 8192
--ffn-hidden-size 29568
--num-attention-heads 64
--seq-length 4096
--max-position-embeddings 4096 #32768
--num-query-groups 8
--group-query-attention
)
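# Rough size check for the arguments above (back-of-the-envelope estimates):
#   head dim             = 8192 / 64 heads = 128
#   GQA                  = 64 query heads sharing 8 KV groups (8 heads per group)
#   per-layer attention  ~ Q/out (2 x 8192^2) + K/V (2 x 8192 x 8 x 128) ~ 0.15B
#   per-layer SwiGLU MLP ~ 3 x 8192 x 29568                              ~ 0.73B
#   80 layers plus untied embeddings put the total around 72B parameters,
#   which is why the header comment says 72B rather than 7B.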
TRAINING_ARGS=(
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 64
--train-iters 100
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--use-distributed-optimizer
--use-flash-attn-v2
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
--no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
)
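# With the settings above, Megatron's cosine schedule roughly follows
#   lr(t) = min_lr + 0.5 * (lr - min_lr) * (1 + cos(pi * t / train_iters))
# after the 1-iteration warmup, decaying from 3.0e-5 down to 3.0e-6.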
#--ddp-average-in-collective
#--overlap-grad-reduce
#--recompute-granularity full
#--recompute-activations
#--use-flash-attn
#--sequence-parallel
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
)
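# Sanity check on the parallel layout (assuming 8 nodes x 8 GPUs = 64 ranks):
#   data-parallel size = 64 / (TP 4 * PP 8) = 2
#   grad accum steps   = global_batch 64 / (micro_batch 1 * DP 2) = 32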
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1 # train/validation/test split weights
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-type QwenTokenizer
--merge-file /mnt/fs/user/llama/dcu/zhaoying_qwen/Megatron-LM-main/qwen_token/merges.txt
--vocab-file /mnt/fs/user/llama/dcu/zhaoying_qwen/Megatron-LM-main/qwen_token/vocab.json
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
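# --save and --load point at the same directory, so a restarted job resumes
# from the most recent checkpoint there (if any) instead of starting from scratch.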
APP="python3 -u pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url tcp://${1}:34566 \
"
case ${lrank} in
[0-7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # every local rank sees all 8 GPUs
${APP}
;;
esac