OpenDAS / Megatron-LM · Commit 03a2a3c1
Authored Jan 23, 2025 by wxj
Update Llama_pretraining.sh: optimize hyperparameter and environment-variable settings
Parent: c4ea7127 · Pipeline #2289 passed

Showing 1 changed file with 57 additions and 50 deletions.

Llama_pretraining.sh (+57, -50)
@@ -17,8 +17,8 @@ export NCCL_IB_TIMEOUT=22
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NCCL_IB_HCA=mlx5_1,mlx5_2
-export NCCL_NET_GDR_LEVEL=SYS
-export NCCL_NET_GDR_READ=0
+export NCCL_NET_GDR_LEVEL=7
+export NCCL_NET_GDR_READ=1
 export GLOG_minloglevel=3  # print error-level NCCL logs
 source /opt/dtk/env.sh
 # load the hipblaslt library
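Note (not part of the commit): the two changed lines raise the GPU Direct RDMA distance threshold (NCCL_NET_GDR_LEVEL) and enable GDR for reads (NCCL_NET_GDR_READ=1). A quick, hedged way to check whether GDR is actually selected is a short run with NCCL debug logging; the grep pattern assumes NCCL's usual transport wording and may vary by version.

    # Hedged sketch: verify that NCCL picks a GDRDMA transport after the change.
    # NCCL_DEBUG/NCCL_DEBUG_SUBSYS are standard NCCL env vars; the script name is the one in this repo.
    export NCCL_DEBUG=INFO
    export NCCL_DEBUG_SUBSYS=INIT,NET
    bash Llama_pretraining.sh 2>&1 | tee nccl_debug.log   # a few iterations are enough
    grep -i "GDRDMA" nccl_debug.log || echo "no GDRDMA transport lines found"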
@@ -26,9 +26,11 @@ source /opt/dtk/env.sh
 # update rocblas
 # export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
 # export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
+# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
 # torch: collapse multiple streams into a single stream
-# export ALLREDUCE_STREAM_WITH_COMPUTE=1
+export ALLREDUCE_STREAM_WITH_COMPUTE=1
+export SENDRECV_STREAM_WITH_COMPUTE=1
 # add synchronization during profiler capture to avoid stalls
 # export GPU_FLUSH_ON_EXECUTION=1
@@ -39,6 +41,9 @@ source /opt/dtk/env.sh
 # capture flash-attention sizes
 # export FLASH_ATTENTION_PRINT_PARAM=1
+# increase the compile cache
+export cache_size_limit=64
 CHECKPOINT_PATH=./tmp_7b #$1 #<Specify path>
 TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
 DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
@@ -91,8 +96,8 @@ TRAINING_ARGS=(
     # --recompute-num-layers 5 #0 #
     # --recompute-method block
     --overlap-grad-reduce  # overlap DDP grad reduce
-    # --tp-comm-overlap  # overlap tensor-parallel comm with GEMM, optimization not yet adapted
-    # --tp-comm-overlap-rs-dgrad  # overlap reduce-scatter with dgrad GEMM, optimization not yet adapted
+    # --tp-comm-overlap  # overlap tensor-parallel comm with GEMM
+    # --tp-comm-overlap-rs-dgrad  # overlap reduce-scatter with dgrad GEMM
     --use-flash-attn-triton
 )
 # --use-flash-attn-cutlass # cutlass fa
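# cutlass fa
Note (not part of the commit): TRAINING_ARGS and MODEL_PARALLEL_ARGS are bash arrays that the script expands into the pretrain_gpt.py command line (the APP string further down uses the same ${ARRAY[@]} pattern). The sketch below only illustrates that composition with trimmed placeholder arrays; the literal APP definition in the repo contains more options.

    # Hedged sketch: how the option arrays typically feed the launch command.
    TRAINING_ARGS=(
        --overlap-grad-reduce
        --use-flash-attn-triton
    )
    MODEL_PARALLEL_ARGS=(
        --sequence-parallel
        --tensor-model-parallel-size 1
        --pipeline-model-parallel-size 2
    )
    APP="python -u pretrain_gpt.py ${TRAINING_ARGS[@]} ${MODEL_PARALLEL_ARGS[@]}"
    echo "${APP}"   # inspect the assembled command before launching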
@@ -100,7 +105,7 @@ TRAINING_ARGS=(
 MODEL_PARALLEL_ARGS=(
     --sequence-parallel
-    --tensor-model-parallel-size 2
+    --tensor-model-parallel-size 1
     --pipeline-model-parallel-size 2
 )
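Note (not part of the commit): the tensor-parallel size drops from 2 to 1 while pipeline parallelism stays at 2. Under the usual Megatron-LM sizing rule, data-parallel size = world_size / (TP * PP), so on the 8 devices exposed below this moves the layout from DP=2 to DP=4. The lines below are a hedged arithmetic check, assuming single-node training on 8 GPUs.

    # Hedged sketch: sanity-check the parallel layout implied by the new settings.
    WORLD_SIZE=8   # assumes HIP_VISIBLE_DEVICES=0..7 on one node
    TP=1           # --tensor-model-parallel-size (was 2)
    PP=2           # --pipeline-model-parallel-size
    DP=$(( WORLD_SIZE / (TP * PP) ))
    echo "data-parallel size = ${DP}"   # prints 4; was 2 before this commit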
@@ -157,48 +162,50 @@ APP="python -u pretrain_gpt.py \
 # enable profiling
 #    ${PROFILE_ARGS[@]} \
-export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
+# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
 # export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
-${APP}
-# case ${LOCAL_RANK} in
-# [0])
-#     # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [1])
-#     # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [2])
-#     # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [3])
-#     # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [4])
-#     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [5])
-#     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [6])
-#     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# [7])
-#     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-#     ${APP}
-#     # numactl --cpunodebind=0 --membind=0 ${APP}
-#     ;;
-# esac
+# bind each local rank to a NUMA node with numactl
+case ${LOCAL_RANK} in
+[0])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+[1])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
+    ;;
+[2])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    numactl --cpunodebind=2 --membind=2 ${APP}
+    ;;
+[3])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=3 --membind=3 ${APP}
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+[4])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=4 --membind=4 ${APP}
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+[5])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=5 --membind=5 ${APP}
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+[6])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=6 --membind=6 ${APP}
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+[7])
+    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=7 --membind=7 ${APP}
+    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
+    ;;
+esac
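Note (not part of the commit): the eight new branches differ only in the NUMA node index, which equals the local rank. If one NUMA node per local rank is indeed the intended mapping, the block could be collapsed as sketched below; this is a hedged simplification, assuming LOCAL_RANK is set by the launcher and lies in 0-7, and that NUMA node N hosts GPU N.

    # Hedged sketch: equivalent per-rank NUMA binding without the case block.
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    NODE=${LOCAL_RANK:?LOCAL_RANK must be set by the launcher}
    numactl --cpunodebind="${NODE}" --membind="${NODE}" ${APP}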