OpenDAS / Megatron-LM / Commits / f2464dc2

Commit f2464dc2, authored Nov 07, 2024 by liangjing

Commit message: update

Parent: 0b5cd1a0
Pipeline #1850 passed
Showing 3 changed files with 446 additions and 0 deletions (+446, -0):

scripts/llama2_13b.sh  (+155, -0)
scripts/llama2_70b.sh  (+138, -0)
scripts/llama2_7b.sh   (+153, -0)
scripts/llama2_13b.sh (new file, mode 100755)
#!/bin/bash
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "13B" parameter model

export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=SYS
source /opt/dtk/env.sh
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

CHECKPOINT_PATH=./tmp_13b         #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp_13b   #$2 #<Specify path>
DATA_PATH="/path_to_dataset/my-llama_text_document"   #<Specify path and file prefix>_text_document
TOKENIZER_PATH="/path_to_tokenizer.model"

GPT_MODEL_ARGS=(
    --num-layers 40
    --hidden-size 5120
    --num-attention-heads 40
    --ffn-hidden-size 13824
    --seq-length 4096
    --max-position-embeddings 4096
)

TRAINING_ARGS=(
    --transformer-impl local
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 256
    --train-iters 100
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --use-flash-attn-triton
    --use-distributed-optimizer
    --recompute-activations
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --no-gradient-accumulation-fusion
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --use-fast-cross-entropy-loss
)

MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 8
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --split 949,50,1
    --untie-embeddings-and-output-weights
    --use-rotary-position-embeddings
    --normalization RMSNorm
    --no-position-embedding
    --tokenizer-model $TOKENIZER_PATH
    --tokenizer-type Llama2Tokenizer
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --log-throughput
    --save-interval 10000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

APP="python3 -u pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]}
    --rank ${RANK} \
    --world_size ${WORLD_SIZE} \
    --dist_url tcp://${1}:34566 \
    "

# for hygon cpu
case ${lrank} in
[0])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
[4])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=4 --membind=4 ${APP}
    ;;
[5])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=5 --membind=5 ${APP}
    ;;
[6])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=6 --membind=6 ${APP}
    ;;
[7])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=7 --membind=7 ${APP}
    ;;
esac
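A possible way to run this script, shown only as a hedged sketch: it reads the OMPI_COMM_WORLD_* variables and forwards its first positional argument into --dist_url, which points to an Open MPI launcher with 8 ranks per node (one per NUMA node, matching the numactl case above). With --tensor-model-parallel-size 1 and --pipeline-model-parallel-size 8, at least 8 ranks are needed. MASTER_ADDR and the rank counts below are placeholder assumptions, not values from the commit:

# Hypothetical single-node launch (assumes Open MPI; TP=1 x PP=8 needs 8 ranks)
# MASTER_ADDR is a placeholder for the rank-0 node's IP; the script passes it to --dist_url
MASTER_ADDR=10.0.0.1
mpirun -np 8 --npernode 8 --bind-to none \
    bash scripts/llama2_13b.sh ${MASTER_ADDR}
# --bind-to none leaves CPU/memory binding to the script's own numactl calls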
scripts/llama2_70b.sh (new file, mode 100755)
#!/bin/bash
# Runs the "70B" parameter model
source /opt/dtk/env.sh

export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=SYS
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=xx   #based on your environment
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

CHECKPOINT_PATH=./tmp         #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp   #$2 #<Specify path>
DATA_PATH="/path_to_dataset/my-llama_text_document"   #<Specify path and file prefix>_text_document
TOKENIZER_PATH="/path_to_tokenizer.model"

GPT_MODEL_ARGS=(
    --num-layers 80
    --hidden-size 8192
    --num-attention-heads 64
    --ffn-hidden-size 28672
    --seq-length 4096
    --max-position-embeddings 4096
    --num-query-groups 8
    --group-query-attention
)

TRAINING_ARGS=(
    --transformer-impl local
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 512
    --train-iters 100
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --use-distributed-optimizer
    --use-flash-attn-triton
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --no-gradient-accumulation-fusion
    --overlap-grad-reduce
    --swiglu
    --lr 5.0e-4
    --lr-decay-style cosine
    --min-lr 1.0e-4
    --lr-warmup-iters 1
    --use-fast-cross-entropy-loss
)

MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 8
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --split 949,50,1
    --untie-embeddings-and-output-weights
    --use-rotary-position-embeddings
    --normalization RMSNorm
    --no-position-embedding
    --tokenizer-model $TOKENIZER_PATH
    --tokenizer-type Llama2Tokenizer
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --log-throughput
    --save-interval 200
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

APP="python3 -u pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]}
    --rank ${RANK} \
    --world_size ${WORLD_SIZE} \
    --dist_url tcp://${1}:34566 \
    "

# for hygon cpu
case ${lrank} in
[0])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
[4])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=4 --membind=4 ${APP}
    ;;
[5])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=5 --membind=5 ${APP}
    ;;
[6])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=6 --membind=6 ${APP}
    ;;
[7])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=7 --membind=7 ${APP}
    ;;
esac
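A hedged launch sketch for the 70B configuration: --tensor-model-parallel-size 4 and --pipeline-model-parallel-size 8 imply at least 32 ranks (for example, four 8-GPU nodes), and NCCL_IB_HCA=xx above is meant to be replaced with the HCA name of the actual InfiniBand fabric. The hostfile name and MASTER_ADDR below are placeholder assumptions, with an Open MPI launcher assumed:

# Hypothetical multi-node launch (assumes Open MPI; TP=4 x PP=8 => 32 ranks minimum)
# "hostfile" lists the nodes; MASTER_ADDR is the rank-0 node's IP for --dist_url
MASTER_ADDR=10.0.0.1
mpirun -np 32 --npernode 8 --hostfile hostfile --bind-to none \
    bash scripts/llama2_70b.sh ${MASTER_ADDR}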
scripts/llama2_7b.sh (new file, mode 100755)
#!/bin/bash
# Runs the "7B" parameter model

export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=SYS
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

CHECKPOINT_PATH=./tmp         #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp   #$2 #<Specify path>
DATA_PATH="/path_to_dataset/my-llama_text_document"   #<Specify path and file prefix>_text_document
TOKENIZER_PATH="/path_to_tokenizer.model"

GPT_MODEL_ARGS=(
    --num-layers 32
    --hidden-size 4096
    --num-attention-heads 32
    --ffn-hidden-size 11008
    --seq-length 4096
    --max-position-embeddings 4096
)

TRAINING_ARGS=(
    --log-throughput
    --transformer-impl local
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 240
    --train-iters 100
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --use-distributed-optimizer
    --use-flash-attn-triton
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --no-gradient-accumulation-fusion
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --use-fast-cross-entropy-loss
)

MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 2
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --split 949,50,1
    --untie-embeddings-and-output-weights
    --use-rotary-position-embeddings
    --normalization RMSNorm
    --no-position-embedding
    --tokenizer-model $TOKENIZER_PATH
    --tokenizer-type Llama2Tokenizer
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --save-interval 10000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

APP="python3 -u pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]}
    --rank ${RANK} \
    --world_size ${WORLD_SIZE} \
    --dist_url tcp://${1}:34566 \
    "

# for hygon cpu
case ${lrank} in
[0])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
[4])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=4 --membind=4 ${APP}
    ;;
[5])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=5 --membind=5 ${APP}
    ;;
[6])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=6 --membind=6 ${APP}
    ;;
[7])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=7 --membind=7 ${APP}
    ;;
esac
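For the 7B configuration, --tensor-model-parallel-size 1 and --pipeline-model-parallel-size 2 on a single 8-GPU node would leave a data-parallel size of 4, so the global batch of 240 with micro-batch size 1 corresponds to 60 accumulated micro-batches per step. A minimal single-node sketch, assuming an Open MPI launcher and a placeholder master address:

# Hypothetical single-node launch for the 7B script (8 ranks: PP=2 x DP=4)
MASTER_ADDR=127.0.0.1   # placeholder; the script forwards it to --dist_url
mpirun -np 8 --npernode 8 --bind-to none bash scripts/llama2_7b.sh ${MASTER_ADDR}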