OpenDAS / Megatron-LM · Commits · 566da70d

Commit 566da70d, authored Mar 07, 2025 by wangxj

Update the example llama script

parent 2c63b5cd · Pipeline #2452 passed · Changes 1 · Pipelines 1
Showing 1 changed file with 247 additions and 0 deletions

Llama2_7b.sh  +247  −0
Llama_pretraining.sh → Llama2_7b.sh
@@ -8,45 +8,56 @@ export OMP_NUM_THREADS=1
 export NCCL_P2P_LEVEL=PXB # SYS
 #export HIP_ALLOC_INITIALIZE=0
-#export GPU_MAX_HW_QUEUES=20 # sglang bubbles
+# export GPU_MAX_HW_QUEUES=10
 export NCCL_ALGO=Ring
 export NCCL_NCHANNELS_PER_PEER=16
-export NCCL_MIN_NCHANNELS=20
+export NCCL_MIN_NCHANNELS=32 # 20
+export NCCL_MAX_NCHANNELS=32 # 20
 export NCCL_IB_TIMEOUT=22
 export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_IB_HCA=mlx5_1,mlx5_2
+export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+# export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,,mlx5_4,,mlx5_5,,mlx5_6,,mlx5_7
-export NCCL_NET_GDR_LEVEL=7
+export NCCL_NET_GDR_LEVEL=SYS
-export NCCL_NET_GDR_READ=1
+export NCCL_NET_GDR_READ=0
+export RCCL_SDMA_COPY_ENABLE=0
+export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
+# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
 export GLOG_minloglevel=3 # print error-level NCCL logs
+# export TORCH_COMPILE_DEBUG=1 # inspect the compiled graph
 source /opt/dtk/env.sh
 # load the hipblaslt library
 # export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
-# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
 # update rocblas
 # export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
 # export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
-# export LD_LIBRARY_PATH=/data/rocblas-install_0107_trans/lib:$LD_LIBRARY_PATH
-# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
+# export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/rocblas-install-0224/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/rocblas-install-0227/lib:$LD_LIBRARY_PATH
 # torch: collapse multi-stream into a single stream
-# export ALLREDUCE_STREAM_WITH_COMPUTE=1
+export ALLREDUCE_STREAM_WITH_COMPUTE=1
+export SENDRECV_STREAM_WITH_COMPUTE=1
 # add synchronization for profiler collection to avoid stalls
 # export GPU_FLUSH_ON_EXECUTION=1
 # export HIP_DIRECT_DISPATCH=0
 # collect rocblas sizes
-export ROCBLAS_LAYER=3
+# export ROCBLAS_LAYER=3
 # collect flash-attention sizes
 # export FLASH_ATTENTION_PRINT_PARAM=1
-CHECKPOINT_PATH=./tmp_7b #$1 #<Specify path>
+# increase the compile cache size
+export cache_size_limit=64
+# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
+SAVE_PATH=./tmp_7b
 TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
-DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document"
+DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" #<Specify path and file prefix>_text_document
+# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" #<Specify path and file prefix>_text_document
 GPT_MODEL_ARGS=(
     --num-layers 32
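Note: the NCCL_IB_HCA list and the LD_LIBRARY_PATH overrides above are node-specific. A minimal pre-flight check, assuming rdma-core is installed and using the paths from this script:

    ibv_devices                                # HCA names reported here should match NCCL_IB_HCA
    ls /public/home/wangxj/Downloads/hipblaslt-install-dtk-25.04-0212/lib \
       /public/home/wangxj/Downloads/rocblas-install-0227/lib
    export NCCL_DEBUG=INFO                     # for one debug run, log what NCCL/RCCL actually selects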
@@ -56,34 +67,22 @@ GPT_MODEL_ARGS=(
     --max-position-embeddings 4096
     --normalization RMSNorm
-    --position-embedding-type rope
+    --position-embedding-type rope # none #
     --untie-embeddings-and-output-weights # handle the embedding and output weights separately for more flexibility
 )
-# GPT_MODEL_ARGS=(
-#     --num-layers 40
-#     --hidden-size 5120
-#     --ffn-hidden-size 13824
-#     --num-attention-heads 40
-#     --max-position-embeddings 4096
-#     --normalization RMSNorm
-#     --position-embedding-type rope
-#     --untie-embeddings-and-output-weights # handle the embedding and output weights separately for more flexibility
-# )
-# export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention
-export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention
+export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention
+# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention
 # --transformer-impl transformer_engine # use these two args to go through core
 # --use-mcore-models
 # --transformer-impl local # use these two args to go through legacy
 # --use-legacy-models
 TRAINING_ARGS=(
-    --transformer-impl local # go through legacy with these two args
+    --transformer-impl transformer_engine # go through core with these two args
-    --use-legacy-models
+    --use-mcore-models
     --micro-batch-size 1
-    --global-batch-size 64 #240 #60 #512 #64
+    --global-batch-size 256 #256 #240 #60 #512 #64
-    --train-iters 10
+    --train-iters 50
     --weight-decay 0.1
     --adam-beta1 0.9
     --adam-beta2 0.95
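Note: with --micro-batch-size 1 and --global-batch-size 256, and the TP=1 / PP=2 layout set later in this diff, an 8-GPU node gives a data-parallel size of 4, so each iteration accumulates 64 micro-batches. A small sketch of the arithmetic, assuming a single 8-GPU node as used elsewhere in the script:

    # gradient-accumulation steps implied by the sizes in this script
    WORLD=8
    TP=1
    PP=2
    MICRO=1
    GLOBAL=256
    DP=$(( WORLD / (TP * PP) ))           # 4
    ACC=$(( GLOBAL / (MICRO * DP) ))      # 64 micro-batches accumulated per iteration
    echo "DP=${DP} accumulation_steps=${ACC}"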
@@ -96,7 +95,7 @@ TRAINING_ARGS=(
     --disable-bias-linear
     --attention-dropout 0
     --hidden-dropout 0
-    --no-gradient-accumulation-fusion # enabling the fusion gives wrong accuracy; it can be enabled once apex is updated
+    # --no-gradient-accumulation-fusion
     --swiglu
     --lr 3.0e-5
     --lr-decay-style cosine
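Note: dropping --no-gradient-accumulation-fusion means the run now relies on apex's fused weight-gradient kernel. A hedged pre-check, assuming the extension module name that current Megatron-LM imports (adjust if your apex build exposes a different name):

    # check that apex's fused weight-gradient extension imports; if not, keep --no-gradient-accumulation-fusion
    python -c "import fused_weight_gradient_mlp_cuda" 2>/dev/null \
        && echo "gradient-accumulation fusion available" \
        || echo "fused extension missing; keep --no-gradient-accumulation-fusion"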
@@ -109,16 +108,26 @@ TRAINING_ARGS=(
     # --recompute-method block
     --overlap-grad-reduce # overlap the DDP grad reduce
     # --tp-comm-overlap # overlap tensor-parallel comm with the GEMMs; this optimization is not adapted yet
-    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM; this optimization is not adapted yet
+    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
-    --use-flash-attn-cutlass
+    --use-flash-attn
 )
+# environment variables for the torch flash-attention
+# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
+# export TORCHINDUCTOR_BENCHMARK_FUSION=1
+# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
+# export TORCHINDUCTOR_MAX_AUTOTUNE=1
+# export TORCHINDUCTOR_CACHE_DIR=./cache
 # --use-flash-attn-cutlass # cutlass fa
 # --use-flash-attn-triton # triton fa
+# --use-flash-attn-torch # torch fa
 MODEL_PARALLEL_ARGS=(
     --sequence-parallel
-    --tensor-model-parallel-size 2
+    --tensor-model-parallel-size 1
     --pipeline-model-parallel-size 2
+    # --num-layers-per-virtual-pipeline-stage 4
+    # --microbatch-group-size-per-virtual-pipeline-stage 1
+    # --no-overlap-p2p-communication # when enabled
 )
 DATA_ARGS=(
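Note: lowering --tensor-model-parallel-size from 2 to 1 while keeping a pipeline size of 2 changes the data-parallel size. Megatron requires the world size to be divisible by TP × PP; a small sanity check, assuming the 8-rank-per-node launch used elsewhere in this script:

    # WORLD must be divisible by TP * PP; the quotient becomes the data-parallel size
    TP=1
    PP=2
    WORLD=${OMPI_COMM_WORLD_SIZE:-8}
    if (( WORLD % (TP * PP) != 0 )); then
        echo "world size ${WORLD} is not divisible by TP*PP=$((TP * PP))" >&2
    fi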
@@ -126,7 +135,8 @@ DATA_ARGS=(
     --seq-length 4096 #4096
     --split 949,50,1
     --tokenizer-type Llama2Tokenizer
-    --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
+    --tokenizer-model /public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model
+    # --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
 )
 EVAL_AND_LOGGING_ARGS=(
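Note: DATA_PATH is a prefix rather than a file; Megatron's indexed dataset loader looks for a <prefix>.bin / <prefix>.idx pair as produced by tools/preprocess_data.py. A quick existence check using the prefix from this diff:

    PREFIX=/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document
    ls -lh "${PREFIX}.bin" "${PREFIX}.idx"     # both files must exist for --data-path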
@@ -134,12 +144,20 @@ EVAL_AND_LOGGING_ARGS=(
     --log-throughput
     --save-interval 1000
     --eval-interval 1000
-    --save $CHECKPOINT_PATH
+    --save $SAVE_PATH
-    --load $CHECKPOINT_PATH
+    --load $SAVE_PATH
     --eval-iters 10
     --tensorboard-dir $TENSORBOARD_LOGS_PATH
 )
+# FINETUNE_ARGS=(
+#     # --finetune
+#     # --pretrained-checkpoint $CHECKPOINT_PATH
+#     --load $CHECKPOINT_PATH
+#     --no-load-optim
+#     --no-load-rng
+# )
 PROFILE_ARGS=(
     --profile
     --profile-step-start 4
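Note: --tensorboard-dir points at the same ./tmp_7b directory as the checkpoints, so the throughput and loss curves enabled by --log-throughput land there as TensorBoard event files and can be viewed with a stock TensorBoard install:

    tensorboard --logdir ./tmp_7b --port 6006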
@@ -153,7 +171,7 @@ RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 DIST_URL=${1}
-DIST_PORT=34567
+DIST_PORT=34577
 DISTRIBUTED_ARGS=(
     --rank ${RANK}
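Note: RANK, LOCAL_RANK and WORLD_SIZE are read from Open MPI's OMPI_COMM_WORLD_* variables, and the rendezvous host arrives as $1 (DIST_URL), so the script is meant to be started once per rank by mpirun. A hedged example with placeholder hostnames nodeA/nodeB (two 8-GPU nodes):

    # hypothetical launch: 16 ranks across two nodes, rendezvous on nodeA:34577
    mpirun -np 16 -H nodeA:8,nodeB:8 bash Llama2_7b.sh nodeA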
@@ -162,6 +180,13 @@ DISTRIBUTED_ARGS=(
     --dist-url tcp://${DIST_URL}:${DIST_PORT}
 )
+# torchrun --nproc_per_node 8 pretrain_gpt.py \
+#     ${GPT_MODEL_ARGS[@]} \
+#     ${TRAINING_ARGS[@]} \
+#     ${MODEL_PARALLEL_ARGS[@]} \
+#     ${DATA_ARGS[@]} \
+#     ${EVAL_AND_LOGGING_ARGS[@]}
 APP="python -u pretrain_gpt.py \
     ${GPT_MODEL_ARGS[@]} \
     ${TRAINING_ARGS[@]} \
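Note: APP is assembled as a single string, so the ${...ARGS[@]} arrays are expanded once at assignment time rather than at launch. Printing it before running makes the final argument list easy to audit:

    echo "${APP}"    # inspect the fully expanded pretrain_gpt.py command line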
@@ -174,48 +199,49 @@ APP="python -u pretrain_gpt.py \
 # enable profiling
 # ${PROFILE_ARGS[@]} \
-# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
 # export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
-${APP}
-# case ${LOCAL_RANK} in
-# [0])
-#     # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [1])
-#     # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [2])
-#     # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [3])
-#     # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [4])
-#     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [5])
-#     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [6])
-#     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [7])
-#     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# esac
+case ${LOCAL_RANK} in
+[0])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+[1])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
+    ;;
+[2])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    numactl --cpunodebind=2 --membind=2 ${APP}
+    ;;
+[3])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=3 --membind=3 ${APP}
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+[4])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=4 --membind=4 ${APP}
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+[5])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=5 --membind=5 ${APP}
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+[6])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=6 --membind=6 ${APP}
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+[7])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=7 --membind=7 ${APP}
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+esac
\ No newline at end of file
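Note: each case arm binds local rank N to CPU/NUMA node N via numactl. Assuming the NUMA node numbering matches the local-rank numbering, which is exactly what the arms above encode, the whole block collapses to a single binding line; numactl --hardware shows the actual node layout:

    numactl --hardware                         # verify the NUMA layout first
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} ${APP}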