evt_fugx1 / dcu_megatron · Commits · 1e8185f4

Commit 1e8185f4, authored May 23, 2025 by wangxj
Add the llama and qwen scripts
parent 3e2c4b1e
Showing 17 changed files with 1174 additions and 510 deletions (+1174, −510)
examples/llama/Llama2_70b.sh                +0    −248
examples/llama/Llama3_70b.sh                +0    −244
examples/llama/hostfile                     +8    −0
examples/llama/hostfile_llama2_7B           +0    −0
examples/llama/run_llama2_13B.sh            +38   −0
examples/llama/run_llama2_70B.sh            +37   −0
examples/llama/run_llama2_7B.sh             +15   −11
examples/llama/run_llama3_70B.sh            +37   −0
examples/llama/train_llama2_13b_1nodes.sh   +173  −0
examples/llama/train_llama2_70b_8nodes.sh   +175  −0
examples/llama/train_llama2_7b_1nodes.sh    +19   −7
examples/llama/train_llama3_70b_8nodes.sh   +175  −0
examples/qwen/hostfile                      +2    −0
examples/qwen/run_qwen1.5_14B.sh            +38   −0
examples/qwen/run_qwen1.5_32B.sh            +38   −0
examples/qwen/train_qwen1.5_14b_1nodes.sh   +208  −0
examples/qwen/train_qwen1.5_32b_2nodes.sh   +211  −0
examples/llama/Llama2_70b.sh — deleted (mode 100755 → 0), file as of 3e2c4b1e

#!/bin/bash
# set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "70B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print NCCL logs at error level

source /opt/dtk/env.sh

# load the hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# updated rocblas
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH

# torch: collapse multi-stream communication onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1

# add synchronization when profiling to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# collect rocblas sizes
# export ROCBLAS_LAYER=3
# export HIPBLASLT_LOG_LEVEL=3
# collect flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1

# increase the compile cache
export cache_size_limit=64

# lightop kernel library
export PYTORCH_ROCM_ARCH='gfx906;gfx926;gfx936'

# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/gmhtest_tmp/RedPajama-Data-1T-Sample/redpajama_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document

GPT_MODEL_ARGS=(
    --num-layers 80 #80 #80 #40 # 20 #
    --hidden-size 8192
    --ffn-hidden-size 22016 # 28672
    --num-attention-heads 64
    --max-position-embeddings 8192
    --group-query-attention
    --num-query-groups 8
    --normalization RMSNorm
    --position-embedding-type rope
    --untie-embeddings-and-output-weights # handle the embedding and output weights separately for more flexibility
)

export NVTE_FLASH_ATTN=1 # use the cutlass path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention

# --transformer-impl transformer_engine # use these two options for the core (mcore) path
# --use-mcore-models
# --transformer-impl local # use these two options for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
    --transformer-impl local # use these two options for the legacy path
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 512 #32 #512 #256 # 64 #240 #60 #512 #64
    --train-iters 300
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    # --fp16 # fp16 requires an explicit loss scale
    # --loss-scale 1024
    --use-distributed-optimizer
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    # --no-gradient-accumulation-fusion
    # --no-check-for-nan-in-loss-and-grad
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective # in DP communication, average gradients/params directly instead of summing to one device and then averaging
    # --recompute-activations
    # --recompute-granularity full # recomputation reduces memory at the cost of extra time
    # --recompute-num-layers 1 #0 #
    # --recompute-method block
    --overlap-grad-reduce # overlap the DDP grad reduce
    # --tp-comm-overlap # overlap tensor-parallel comm with GEMM (core path)
    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM (core path)
    --use-flash-attn
)
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa

MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 8
    --context-parallel-size 1
    # --num-layers-per-virtual-pipeline-stage 1
    # --microbatch-group-size-per-virtual-pipeline-stage 5
    # --no-overlap-p2p-communication
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --seq-length 4096 #8192 #4096
    --split 949,50,1
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model /public/home/gmhtest_tmp/RedPajama-Data-1T-Sample/tokenizer.model
    # --tokenizer-model /data/model_weights/llama3.2/tokenizer.model
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --log-throughput
    --save-interval 500
    --eval-interval 50
    --eval-iters 3
    --save $SAVE_PATH
    --load $SAVE_PATH
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

# FINETUNE_ARGS=(
#     # --finetune
#     # --pretrained-checkpoint $CHECKPOINT_PATH
#     --load $CHECKPOINT_PATH
#     --no-load-optim
#     --no-load-rng
# )

PROFILE_ARGS=(
    --profile
    --profile-step-start 4
    --profile-step-end 5
    --use-pytorch-profiler
    --profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    --profile-dir prof_data
)

RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

APP="python -u ../../pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    "
    # enable profiling
    # ${PROFILE_ARGS[@]} \

# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}

case ${LOCAL_RANK} in
    [0])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
        numactl --cpunodebind=0 --membind=0 ${APP}
        ;;
    [1])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=1 --membind=1 ${APP}
        ;;
    [2])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=2 --membind=2 ${APP}
        ;;
    [3])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=3 --membind=3 ${APP}
        ;;
    [4])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=4 --membind=4 ${APP}
        ;;
    [5])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=5 --membind=5 ${APP}
        ;;
    [6])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=6 --membind=6 ${APP}
        ;;
    [7])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=7 --membind=7 ${APP}
        ;;
esac
examples/llama/Llama3_70b.sh — deleted (mode 100644 → 0), file as of 3e2c4b1e

#!/bin/bash
# set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "70B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print NCCL logs at error level

source /opt/dtk/env.sh

# load the hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# updated rocblas
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH

# torch: collapse multi-stream communication onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1

# add synchronization when profiling to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# collect rocblas sizes
# export ROCBLAS_LAYER=3
# export HIPBLASLT_LOG_LEVEL=3
# collect flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1

# increase the compile cache
export cache_size_limit=64

# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document

GPT_MODEL_ARGS=(
    --num-layers 80 #80 #80 #40 # 20 #
    --hidden-size 8192
    --ffn-hidden-size 28672
    --num-attention-heads 64
    --max-position-embeddings 8192
    --group-query-attention
    --num-query-groups 8
    --normalization RMSNorm
    --position-embedding-type rope
    --untie-embeddings-and-output-weights # handle the embedding and output weights separately for more flexibility
)

export NVTE_FLASH_ATTN=1 # use the cutlass path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention

# --transformer-impl transformer_engine # use these two options for the core (mcore) path
# --use-mcore-models
# --transformer-impl local # use these two options for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
    --transformer-impl local # use these two options for the legacy path
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 960 #32 #512 #256 # 64 #240 #60 #512 #64
    --train-iters 100
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    # --fp16 # fp16 requires an explicit loss scale
    # --loss-scale 1024
    --use-distributed-optimizer
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    # --no-gradient-accumulation-fusion
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective # in DP communication, average gradients/params directly instead of summing to one device and then averaging
    # --recompute-activations
    # --recompute-granularity full # recomputation reduces memory at the cost of extra time
    # --recompute-num-layers 1 #0 #
    # --recompute-method block
    --overlap-grad-reduce # overlap the DDP grad reduce
    # --tp-comm-overlap # overlap tensor-parallel comm with GEMM (core path)
    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM (core path)
    --use-flash-attn-cutlass
)
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa

MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 8
    # --context-parallel-size 2
    # --num-layers-per-virtual-pipeline-stage 5
    # --microbatch-group-size-per-virtual-pipeline-stage 1
    # --no-overlap-p2p-communication
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --seq-length 4096 #8192 #4096
    --split 949,50,1
    --tokenizer-type Llama3Tokenizer
    --tokenizer-model /public/home/wangxj/Downloads/model_weights/llama3.2/tokenizer.model
    # --tokenizer-model /data/model_weights/llama3.2/tokenizer.model
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --log-throughput
    --save-interval 1000
    --eval-interval 1000
    --save $SAVE_PATH
    --load $SAVE_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

# FINETUNE_ARGS=(
#     # --finetune
#     # --pretrained-checkpoint $CHECKPOINT_PATH
#     --load $CHECKPOINT_PATH
#     --no-load-optim
#     --no-load-rng
# )

PROFILE_ARGS=(
    --profile
    --profile-step-start 4
    --profile-step-end 5
    --use-pytorch-profiler
    --profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    --profile-dir prof_data
)

RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

APP="python -u pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    "
    # enable profiling
    # ${PROFILE_ARGS[@]} \

# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}

case ${LOCAL_RANK} in
    [0])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
        numactl --cpunodebind=0 --membind=0 ${APP}
        ;;
    [1])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=1 --membind=1 ${APP}
        ;;
    [2])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=2 --membind=2 ${APP}
        ;;
    [3])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=3 --membind=3 ${APP}
        ;;
    [4])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=4 --membind=4 ${APP}
        ;;
    [5])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=5 --membind=5 ${APP}
        ;;
    [6])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=6 --membind=6 ${APP}
        ;;
    [7])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=7 --membind=7 ${APP}
        ;;
esac
examples/llama/hostfile — new file (mode 100644) @ 1e8185f4

node036 slots=8
node061 slots=8
node062 slots=8
node063 slots=8
node064 slots=8
node065 slots=8
node066 slots=8
node067 slots=8
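
This hostfile is what the run_*.sh wrappers below hand to mpirun; each `slots=8` entry corresponds to the 8 DCUs per node. A minimal sketch of how it is consumed is shown here; the two-host subset and the echo payload are illustrative only, while the real wrappers additionally pass --allow-run-as-root, --mca plm_rsh_no_tree_spawn 1, and source the dtk/nccl env inside bash -c.

# Sketch: launch 16 ranks (8 per node) on the first two hosts listed above.
mpirun -np 16 --hostfile examples/llama/hostfile \
    --bind-to none \
    bash -c "hostname && echo rank \$OMPI_COMM_WORLD_RANK of \$OMPI_COMM_WORLD_SIZE"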
examples/llama/hostfile_llama2_7B — deleted (mode 100644 → 0, empty file), file as of 3e2c4b1e
examples/llama/run_llama2_13B.sh — new file (mode 100755) @ 1e8185f4

for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# These variables need to be modified
GPUS="8" # how many gpus to use
# DTK_ENV="/opt/dtk/env.sh"
DTK_ENV="/public/home/wangxj/Downloads/blas/dtk-25.04.1-rc1/env.sh" # path to the dtk env.sh
# NCCL_ENV="/workspace/dcu_megatron/requirements/nccl_wz/env.sh" # path to the nccl env.sh (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
NCCL_ENV="/public/home/wangxj/Projects/dcu_megatron/requirements/nccl_wz/env.sh"
HOST="localhost" # hostname
PORT="11451"     # port id
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" # path to oscar-1GB_head-llama2_text_document
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document"
# TOKENIZER_MODEL_PATH="/data/model_weights/llama2_7b_hf/tokenizer.model" # path to tokenizer.model
TOKENIZER_MODEL_PATH="/public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model"
CHECKPOINT_PATH="./ckpt" # path to ckpt

# Runs the Llama2 13B model
mpirun -np ${GPUS} --hostfile hostfile \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "source ${DTK_ENV} && \
        source ${NCCL_ENV} && \
        ./train_llama2_13b_1nodes.sh \
        ${HOST} \
        ${PORT} \
        --data_path=$DATA_PATH \
        --tokenizer_path=$TOKENIZER_MODEL_PATH \
        --checkpoint_path=$CHECKPOINT_PATH \
        --profiling=$profiling" > ./log/log-$((${GPUS} / 8))nodes-`date +%F-%H%M`.log 2>&1
wait
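
Usage sketch for the wrapper above. The only flag it parses is --profiling=...; everything else is hard-coded inside the script, and stdout/stderr are redirected into ./log/, so that directory has to exist. The three invocations below are illustrative, not part of the commit.

mkdir -p log                               # the wrapper writes ./log/log-<N>nodes-<date>.log
bash run_llama2_13B.sh                     # plain training run, no profiler
bash run_llama2_13B.sh --profiling=torch   # forwarded as --profiling=torch; enables the PyTorch profiler args
bash run_llama2_13B.sh --profiling=hip     # forwarded as --profiling=hip; the train script wraps the command with hipprof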
examples/llama/run_llama2_70B.sh — new file (mode 100755) @ 1e8185f4

for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# These variables need to be modified
GPUS="64" # how many gpus to use
DTK_ENV="/public/home/wangxj/Downloads/blas/dtk-25.04.1-rc1/env.sh" # path to the dtk env.sh
# NCCL_ENV="/workspace/dcu_megatron/requirements/nccl_wz/env.sh" # path to the nccl env.sh (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
NCCL_ENV="/public/home/wangxj/Projects/dcu_megatron/requirements/nccl_wz/env.sh"
HOST="node036" # hostname
PORT="11451"   # port id
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" # path to oscar-1GB_head-llama2_text_document
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document"
# TOKENIZER_MODEL_PATH="/data/model_weights/llama2_7b_hf/tokenizer.model" # path to tokenizer.model
TOKENIZER_MODEL_PATH="/public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model"
CHECKPOINT_PATH="./ckpt" # path to ckpt

# Runs the Llama2 70B model
mpirun -np ${GPUS} --hostfile hostfile \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "source ${DTK_ENV} && \
        source ${NCCL_ENV} && \
        ./train_llama2_70b_8nodes.sh \
        ${HOST} \
        ${PORT} \
        --data_path=$DATA_PATH \
        --tokenizer_path=$TOKENIZER_MODEL_PATH \
        --checkpoint_path=$CHECKPOINT_PATH \
        --profiling=$profiling" > ./log/log-$((${GPUS} / 8))nodes-`date +%F-%H%M`.log 2>&1
wait
examples/llama/run_llama2_7B.sh — modified @ 1e8185f4

@@ -6,29 +6,33 @@ do
 done
 
 # These variables need to be modified
-GPUS=""                  # how many gpus to use
-DTK_ENV=""               # path to the dtk env.sh
-NCCL_ENV=""              # path to the nccl env.sh (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
-HOST=""                  # hostname
-PORT=""                  # port id
-DATA_PATH=""             # path to oscar-1GB_head-llama2_text_document
-TOKENIZER_MODEL_PATH=""  # path to tokenizer.model
-CHECKPOINT_PATH=""       # path to ckpt
+GPUS="8" # how many gpus to use
+# DTK_ENV="/opt/dtk/env.sh"
+DTK_ENV="/public/home/wangxj/Downloads/blas/dtk-25.04.1-rc1/env.sh" # path to the dtk env.sh
+# NCCL_ENV="/workspace/dcu_megatron/requirements/nccl_wz/env.sh" # path to the nccl env.sh (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
+NCCL_ENV="/public/home/wangxj/Projects/dcu_megatron/requirements/nccl_wz/env.sh"
+HOST="localhost" # hostname
+PORT="11451"     # port id
+# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" # path to oscar-1GB_head-llama2_text_document
+DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document"
+# TOKENIZER_MODEL_PATH="/data/model_weights/llama2_7b_hf/tokenizer.model" # path to tokenizer.model
+TOKENIZER_MODEL_PATH="/public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model"
+CHECKPOINT_PATH="./ckpt" # path to ckpt
 
 # Runs the Llama2 7B model
-mpirun -np ${GPUS} --hostfile hostfile_llama2_7B \
+mpirun -np ${GPUS} --hostfile hostfile \
     --allow-run-as-root \
     --bind-to none \
     --mca plm_rsh_no_tree_spawn 1 \
     bash -c "source ${DTK_ENV} && \
         source ${NCCL_ENV} && \
-        ./train_llama2_7b_$((${GPUS} / 8))nodes.sh \
+        ./train_llama2_7b_1nodes.sh \
         ${HOST} \
         ${PORT} \
         --data_path=$DATA_PATH \
         --tokenizer_path=$TOKENIZER_MODEL_PATH \
         --checkpoint_path=$CHECKPOINT_PATH \
-        --profiling=$profiling" > log-$((${GPUS} / 8))nodes-`date +%F-%H%M`.log 2>&1
+        --profiling=$profiling" > ./log/log-$((${GPUS} / 8))nodes-`date +%F-%H%M`.log 2>&1
 wait
examples/llama/run_llama3_70B.sh — new file (mode 100755) @ 1e8185f4

for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# These variables need to be modified
GPUS="64" # how many gpus to use
DTK_ENV="/public/home/wangxj/Downloads/blas/dtk-25.04.1-rc1/env.sh" # path to the dtk env.sh
# NCCL_ENV="/workspace/dcu_megatron/requirements/nccl_wz/env.sh" # path to the nccl env.sh (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
NCCL_ENV="/public/home/wangxj/Projects/dcu_megatron/requirements/nccl_wz/env.sh"
HOST="node036" # hostname
PORT="11451"   # port id
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" # path to oscar-1GB_head-llama2_text_document
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document"
# TOKENIZER_MODEL_PATH="/data/model_weights/llama2_7b_hf/tokenizer.model" # path to tokenizer.model
TOKENIZER_MODEL_PATH="/public/home/wangxj/Downloads/model_weights/llama3.2/tokenizer.model"
CHECKPOINT_PATH="./ckpt" # path to ckpt

# Runs the Llama3 70B model
mpirun -np ${GPUS} --hostfile hostfile \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "source ${DTK_ENV} && \
        source ${NCCL_ENV} && \
        ./train_llama3_70b_8nodes.sh \
        ${HOST} \
        ${PORT} \
        --data_path=$DATA_PATH \
        --tokenizer_path=$TOKENIZER_MODEL_PATH \
        --checkpoint_path=$CHECKPOINT_PATH \
        --profiling=$profiling" > ./log/log-$((${GPUS} / 8))nodes-`date +%F-%H%M`.log 2>&1
wait
examples/llama/train_llama2_13b_1nodes.sh — new file (mode 100755) @ 1e8185f4

#!/bin/bash
# use the updated hipblaslt and rocblas
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install0519/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0513-release/lib:$LD_LIBRARY_PATH

INITIALIZATION_ARGS=(
    --num-workers 2
)

for para in $*
do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        INITIALIZATION_ARGS=(
            --reproduce
            --num-workers 0
        )
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1 # enable deterministic MIOpen algorithms
        export ROCBLAS_ATOMICS_MOD=0                    # disable rocblas atomic operations
        # disable MIOpen algorithms that rely on atomics; keep only the GEMM algorithm
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done

# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}

# runtime environment parameters
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH

# default env
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10 #10 # 4 # 20
export NVTE_DISABLE_FC2_DGRAD_OVERLAP=1
export NVTE_NO_PIPELINE_OVERLAP=1
# torch: collapse multi-stream communication onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# increase the compile cache
export cache_size_limit=64

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

GPT_MODEL_ARGS=(
    --seq-length 4096
    --num-layers 40
    --hidden-size 5120
    --ffn-hidden-size 13824
    --num-attention-heads 40
    --max-position-embeddings 4096
    --normalization RMSNorm # Lightop
    --position-embedding-type rope
    --untie-embeddings-and-output-weights
)

TRAINING_ARGS=(
    --transformer-impl transformer_engine
    --use-mcore-models
    --micro-batch-size 1
    --global-batch-size 256
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective
    --overlap-grad-reduce
    # --tp-comm-overlap
    # --tp-comm-overlap-rs-dgrad
    --use-flash-attn
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 2
    --context-parallel-size 1
    --use-distributed-optimizer
    --sequence-parallel
)

DATA_ARGS=(
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model ${TOKENIZER_MODEL_PATH}
    --data-path ${DATA_PATH}
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 5
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    ${INITIALIZATION_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]} "
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]} "
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

# for hygon cpu
${MEGATRON_PATH}/requirements/launch_with_binding.sh ${LOCAL_RANK} ${APP}
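
Besides the paths and --profiling, the train script also accepts --reproduce, which switches INITIALIZATION_ARGS to `--reproduce --num-workers 0` and exports the MIOpen/rocBLAS determinism variables. The run_*.sh wrappers do not forward this flag, so it has to be appended when the train script is invoked directly (for example by editing the `bash -c` command in the wrapper). A hedged sketch of a direct single-rank invocation follows; the placeholder paths are illustrative, and the OMPI_* variables are normally provided by mpirun rather than set by hand.

# Sketch only: mpirun normally supplies these rank variables.
export OMPI_COMM_WORLD_RANK=0
export OMPI_COMM_WORLD_LOCAL_RANK=0
export OMPI_COMM_WORLD_SIZE=1
./train_llama2_13b_1nodes.sh localhost 11451 \
    --data_path=/path/to/oscar-1GB_head-llama2_text_document \
    --tokenizer_path=/path/to/tokenizer.model \
    --checkpoint_path=./ckpt \
    --reproduce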
examples/llama/train_llama2_70b_8nodes.sh — new file (mode 100755) @ 1e8185f4

#!/bin/bash
# use the updated hipblaslt and rocblas
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install0519/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0513-release/lib:$LD_LIBRARY_PATH

INITIALIZATION_ARGS=(
    --num-workers 2
)

for para in $*
do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        INITIALIZATION_ARGS=(
            --reproduce
            --num-workers 0
        )
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1 # enable deterministic MIOpen algorithms
        export ROCBLAS_ATOMICS_MOD=0                    # disable rocblas atomic operations
        # disable MIOpen algorithms that rely on atomics; keep only the GEMM algorithm
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done

# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}

# runtime environment parameters
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH

# default env
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10 #10 # 4
# export NVTE_DISABLE_FC2_DGRAD_OVERLAP=1
# export NVTE_NO_PIPELINE_OVERLAP=1
# torch: collapse multi-stream communication onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# increase the compile cache
export cache_size_limit=64

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

GPT_MODEL_ARGS=(
    --seq-length 4096
    --num-layers 80 #80 #80 #40 # 20 #
    --hidden-size 8192
    --ffn-hidden-size 28672 # 28672
    --num-attention-heads 64
    --max-position-embeddings 8192
    --group-query-attention
    --num-query-groups 8
    --normalization RMSNorm # Lightop
    --position-embedding-type rope
    --untie-embeddings-and-output-weights
)

TRAINING_ARGS=(
    --transformer-impl transformer_engine
    --use-mcore-models
    --micro-batch-size 1
    --global-batch-size 512
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective
    --overlap-grad-reduce
    # --tp-comm-overlap
    # --tp-comm-overlap-rs-dgrad
    --use-flash-attn
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 8
    --context-parallel-size 1
    --use-distributed-optimizer
    --sequence-parallel
)

DATA_ARGS=(
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model ${TOKENIZER_MODEL_PATH}
    --data-path ${DATA_PATH}
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 5
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_llama_1nodes_tp4-pp2-cp1-tpoverlap-nosyns_20
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    ${INITIALIZATION_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]} "
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]} "
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

# for hygon cpu
${MEGATRON_PATH}/requirements/launch_with_binding.sh ${LOCAL_RANK} ${APP}
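
For reference, the rank bookkeeping implied by these settings: with --tensor-model-parallel-size 4 and --pipeline-model-parallel-size 8 one model replica spans 4 × 8 = 32 ranks, so the 64 ranks launched by run_llama2_70B.sh (8 nodes × 8 DCUs) give a data-parallel size of 2, and with --micro-batch-size 1 and --global-batch-size 512 each data-parallel replica accumulates 256 micro-batches per iteration. The sketch below just restates this standard Megatron-LM arithmetic for the values above; it is not part of the commit.

# Sketch of the parallelism arithmetic implied by the settings above.
TP=4; PP=8; WORLD_SIZE=64
DP=$((WORLD_SIZE / (TP * PP)))                    # 64 / 32 = 2 data-parallel replicas
GLOBAL_BATCH=512; MICRO_BATCH=1
ACC_STEPS=$((GLOBAL_BATCH / (MICRO_BATCH * DP)))  # 256 gradient-accumulation steps per iteration
echo "DP=${DP} accumulation_steps=${ACC_STEPS}"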
examples/llama/train_llama2_7b_1nodes.sh — modified @ 1e8185f4

 #!/bin/bash
+# use the updated hipblaslt and rocblas
+export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install0519/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0513-release/lib:$LD_LIBRARY_PATH
+
 INITIALIZATION_ARGS=(
     --num-workers 2
 )
 
 for para in $*
@@ -30,7 +34,7 @@ DATA_PATH=${data_path}
 TOKENIZER_MODEL_PATH=${tokenizer_path}
 CHECKPOINT_PATH=${checkpoint_path}
 
-# default env
+# runtime environment parameters
 DIST_URL=${1}
 DIST_PORT=${2}
 RANK=$OMPI_COMM_WORLD_RANK
@@ -38,12 +42,18 @@ LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
 MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
+export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
 
+# default env
 export GLOG_minloglevel=3
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export HSA_FORCE_FINE_GRAIN_PCIE=1
 export OMP_NUM_THREADS=1
-export GPU_MAX_HW_QUEUES=10
-export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
+export GPU_MAX_HW_QUEUES=10 # 4 # 20
+
+# tp-overlap control parameters
+export NVTE_DISABLE_FC2_DGRAD_OVERLAP=1
+export NVTE_NO_PIPELINE_OVERLAP=1
 
 # torch: collapse multi-stream communication onto the compute stream
 export ALLREDUCE_STREAM_WITH_COMPUTE=1
@@ -66,14 +76,14 @@ GPT_MODEL_ARGS=(
     --ffn-hidden-size 11008
     --num-attention-heads 32
     --max-position-embeddings 4096
-    --normalization LightopRMSNorm
+    --normalization RMSNorm # Lightop
     --position-embedding-type rope
     --untie-embeddings-and-output-weights
 )
 
 TRAINING_ARGS=(
-    --transformer-impl local
-    --use-legacy-models
+    --transformer-impl transformer_engine
+    --use-mcore-models
     --micro-batch-size 1
     --global-batch-size 256
     --train-iters 50
@@ -94,6 +104,8 @@ TRAINING_ARGS=(
     --ckpt-format torch
     --ddp-average-in-collective
     --overlap-grad-reduce
+    # --tp-comm-overlap
+    # --tp-comm-overlap-rs-dgrad
     --use-flash-attn
 )
@@ -159,4 +171,4 @@ elif [[ $profiling == "hip" ]]; then
 fi
 
 # for hygon cpu
 ${MEGATRON_PATH}/requirements/launch_with_binding.sh ${LOCAL_RANK} ${APP}
examples/llama/train_llama3_70b_8nodes.sh — new file (mode 100755) @ 1e8185f4

#!/bin/bash
# use the updated hipblaslt and rocblas
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install0519/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0513-release/lib:$LD_LIBRARY_PATH

INITIALIZATION_ARGS=(
    --num-workers 2
)

for para in $*
do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        INITIALIZATION_ARGS=(
            --reproduce
            --num-workers 0
        )
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1 # enable deterministic MIOpen algorithms
        export ROCBLAS_ATOMICS_MOD=0                    # disable rocblas atomic operations
        # disable MIOpen algorithms that rely on atomics; keep only the GEMM algorithm
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done

# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}

# runtime environment parameters
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH

# default env
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=20 #10 # 4
# export NVTE_DISABLE_FC2_DGRAD_OVERLAP=1
# export NVTE_NO_PIPELINE_OVERLAP=1
# torch: collapse multi-stream communication onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# increase the compile cache
export cache_size_limit=64

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

GPT_MODEL_ARGS=(
    --seq-length 4096
    --num-layers 80 #80 #80 #40 # 20 #
    --hidden-size 8192
    --ffn-hidden-size 28672 # 28672
    --num-attention-heads 64
    --max-position-embeddings 8192
    --group-query-attention
    --num-query-groups 8
    --normalization RMSNorm # Lightop
    --position-embedding-type rope
    --untie-embeddings-and-output-weights
)

TRAINING_ARGS=(
    --transformer-impl transformer_engine
    --use-mcore-models
    --micro-batch-size 1
    --global-batch-size 512
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective
    --overlap-grad-reduce
    # --tp-comm-overlap
    # --tp-comm-overlap-rs-dgrad
    --use-flash-attn
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 8
    --context-parallel-size 1
    --use-distributed-optimizer
    --sequence-parallel
)

DATA_ARGS=(
    --tokenizer-type Llama3Tokenizer
    --tokenizer-model ${TOKENIZER_MODEL_PATH}
    --data-path ${DATA_PATH}
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 5
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_llama_1nodes_tp4-pp2-cp1-tpoverlap-nosyns_20
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    ${INITIALIZATION_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]} "
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]} "
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

# for hygon cpu
${MEGATRON_PATH}/requirements/launch_with_binding.sh ${LOCAL_RANK} ${APP}
examples/qwen/hostfile — new file (mode 100644) @ 1e8185f4

node036 slots=8
node034 slots=8
examples/qwen/run_qwen1.5_14B.sh — new file (mode 100755) @ 1e8185f4

for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# These variables need to be modified
GPUS="8" # how many gpus to use
# DTK_ENV="/opt/dtk/env.sh"
DTK_ENV="/public/home/wangxj/Downloads/blas/dtk-25.04.1-rc1/env.sh" # path to the dtk env.sh
# NCCL_ENV="/workspace/dcu_megatron/requirements/nccl_wz/env.sh" # path to the nccl env.sh (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
NCCL_ENV="/public/home/wangxj/Projects/dcu_megatron/requirements/nccl_wz/env.sh"
HOST="localhost" # hostname
PORT="11451"     # port id
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" # path to oscar-1GB_head-llama2_text_document
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-qwen_text_document"
# TOKENIZER_MODEL_PATH="/data/model_weights/llama2_7b_hf/tokenizer.model" # path to tokenizer.model
TOKENIZER_MODEL_PATH="/public/home/wangxj/Downloads/model_weights/qwen1.5"
CHECKPOINT_PATH="./ckpt" # path to ckpt

# Runs the Qwen1.5 14B model
mpirun -np ${GPUS} --hostfile hostfile \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "source ${DTK_ENV} && \
        source ${NCCL_ENV} && \
        ./train_qwen1.5_14b_1nodes.sh \
        ${HOST} \
        ${PORT} \
        --data_path=$DATA_PATH \
        --tokenizer_path=$TOKENIZER_MODEL_PATH \
        --checkpoint_path=$CHECKPOINT_PATH \
        --profiling=$profiling" > ./log/log-$((${GPUS} / 8))nodes-`date +%F-%H%M`.log 2>&1
wait
examples/qwen/run_qwen1.5_32B.sh — new file (mode 100755) @ 1e8185f4

for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# These variables need to be modified
GPUS="16" # how many gpus to use
# DTK_ENV="/opt/dtk/env.sh"
DTK_ENV="/public/home/wangxj/Downloads/blas/dtk-25.04.1-rc1/env.sh" # path to the dtk env.sh
# NCCL_ENV="/workspace/dcu_megatron/requirements/nccl_wz/env.sh" # path to the nccl env.sh (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
NCCL_ENV="/public/home/wangxj/Projects/dcu_megatron/requirements/nccl_wz/env.sh"
HOST="node036" # hostname
PORT="11451"   # port id
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" # path to oscar-1GB_head-llama2_text_document
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-qwen_text_document"
# TOKENIZER_MODEL_PATH="/data/model_weights/llama2_7b_hf/tokenizer.model" # path to tokenizer.model
TOKENIZER_MODEL_PATH="/public/home/wangxj/Downloads/model_weights/qwen1.5"
CHECKPOINT_PATH="./ckpt" # path to ckpt

# Runs the Qwen1.5 32B model
mpirun -np ${GPUS} --hostfile hostfile \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "source ${DTK_ENV} && \
        source ${NCCL_ENV} && \
        ./train_qwen1.5_32b_2nodes.sh \
        ${HOST} \
        ${PORT} \
        --data_path=$DATA_PATH \
        --tokenizer_path=$TOKENIZER_MODEL_PATH \
        --checkpoint_path=$CHECKPOINT_PATH \
        --profiling=$profiling" > ./log/log-$((${GPUS} / 8))nodes-`date +%F-%H%M`.log 2>&1
wait
examples/qwen/train_qwen1.5_14b_1nodes.sh — new file (mode 100755) @ 1e8185f4

#!/bin/bash
# use the updated hipblaslt and rocblas
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install0519/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0513-release/lib:$LD_LIBRARY_PATH

INITIALIZATION_ARGS=(
    --num-workers 2
)

for para in $*
do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        INITIALIZATION_ARGS=(
            --reproduce
            --num-workers 0
        )
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1 # enable deterministic MIOpen algorithms
        export ROCBLAS_ATOMICS_MOD=0                    # disable rocblas atomic operations
        # disable MIOpen algorithms that rely on atomics; keep only the GEMM algorithm
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done

# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}

# runtime environment parameters
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH

# default env
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10 #10 # 4 # 20
export NVTE_DISABLE_FC2_DGRAD_OVERLAP=1
export NVTE_NO_PIPELINE_OVERLAP=1
# torch: collapse multi-stream communication onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# increase the compile cache
export cache_size_limit=64

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

GPT_MODEL_ARGS=(
    --seq-length 4096
    --num-layers 40
    --hidden-size 5120
    --ffn-hidden-size 13696
    --num-attention-heads 40
    --max-position-embeddings 4096
    --normalization RMSNorm # Lightop
    --position-embedding-type rope
    --untie-embeddings-and-output-weights
)

TRAINING_ARGS=(
    --transformer-impl transformer_engine
    --use-mcore-models
    --micro-batch-size 1
    --global-batch-size 256
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --swiglu
    --add-qkv-bias
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective
    --overlap-grad-reduce
    # --tp-comm-overlap
    # --tp-comm-overlap-rs-dgrad
    --use-flash-attn
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 4
    --context-parallel-size 1
    --use-distributed-optimizer
    --sequence-parallel
)

DATA_ARGS=(
    --tokenizer-type QwenTokenizer
    --merge-file ${TOKENIZER_MODEL_PATH}/merges.txt
    --vocab-file ${TOKENIZER_MODEL_PATH}/vocab.json
    --data-path ${DATA_PATH}
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 5
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    ${INITIALIZATION_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]} "
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]} "
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

# for hygon cpu
case ${LOCAL_RANK} in
    [0])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=0 --membind=0 ${APP}
        ;;
    [1])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=1 --membind=1 ${APP}
        ;;
    [2])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=2 --membind=2 ${APP}
        ;;
    [3])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=3 --membind=3 ${APP}
        ;;
    [4])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=4 --membind=4 ${APP}
        ;;
    [5])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=5 --membind=5 ${APP}
        ;;
    [6])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=6 --membind=6 ${APP}
        ;;
    [7])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=7 --membind=7 ${APP}
        ;;
esac
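
Note that, unlike the Llama scripts (which point --tokenizer-model at a single tokenizer.model file), the Qwen scripts pass a directory via --tokenizer_path and read merges.txt and vocab.json from it through --merge-file and --vocab-file. A small hedged pre-flight check is sketched below; the directory value is the one used in run_qwen1.5_14B.sh and the check itself is not part of the commit.

# Sketch: verify the tokenizer directory layout expected by DATA_ARGS above.
TOKENIZER_MODEL_PATH=/public/home/wangxj/Downloads/model_weights/qwen1.5
for f in merges.txt vocab.json; do
    [[ -f ${TOKENIZER_MODEL_PATH}/${f} ]] || echo "missing ${TOKENIZER_MODEL_PATH}/${f}"
done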
examples/qwen/train_qwen1.5_32b_2nodes.sh — new file (mode 100755) @ 1e8185f4

#!/bin/bash
# use the updated hipblaslt and rocblas
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install0519/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0513-release/lib:$LD_LIBRARY_PATH

INITIALIZATION_ARGS=(
    --num-workers 2
)

for para in $*
do
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    elif [[ $para == --reproduce* ]]; then
        INITIALIZATION_ARGS=(
            --reproduce
            --num-workers 0
        )
        export MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC=1 # enable deterministic MIOpen algorithms
        export ROCBLAS_ATOMICS_MOD=0                    # disable rocblas atomic operations
        # disable MIOpen algorithms that rely on atomics; keep only the GEMM algorithm
        export MIOPEN_DEBUG_CONV_FFT=0
        export MIOPEN_DEBUG_CONV_DIRECT=0
        export MIOPEN_DEBUG_CONV_GEMM=1
        export MIOPEN_DEBUG_CONV_WINOGRAD=0
        export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
    fi
done

# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}

# runtime environment parameters
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH

# default env
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10 #10 # 4 # 20
export NVTE_DISABLE_FC2_DGRAD_OVERLAP=1
export NVTE_NO_PIPELINE_OVERLAP=1
# torch: collapse multi-stream communication onto the compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# increase the compile cache
export cache_size_limit=64

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

GPT_MODEL_ARGS=(
    --seq-length 4096
    --num-layers 64
    --hidden-size 5120
    --ffn-hidden-size 27392
    --num-attention-heads 40
    --max-position-embeddings 32768
    --num-query-groups 8
    --group-query-attention
    --normalization RMSNorm # Lightop
    --position-embedding-type rope
    --untie-embeddings-and-output-weights
)

TRAINING_ARGS=(
    --transformer-impl transformer_engine
    --use-mcore-models
    --micro-batch-size 1
    --global-batch-size 256
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    --swiglu
    --add-qkv-bias
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective
    --overlap-grad-reduce
    # --tp-comm-overlap
    # --tp-comm-overlap-rs-dgrad
    --use-flash-attn
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 4
    --context-parallel-size 1
    --use-distributed-optimizer
    --sequence-parallel
)

DATA_ARGS=(
    --tokenizer-type QwenTokenizer
    --merge-file ${TOKENIZER_MODEL_PATH}/merges.txt
    --vocab-file ${TOKENIZER_MODEL_PATH}/vocab.json
    --data-path ${DATA_PATH}
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 5
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)

TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
    --use-pytorch-profiler
)

HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    ${INITIALIZATION_ARGS[@]} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS[@]} "
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS[@]} "
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

# for hygon cpu
case ${LOCAL_RANK} in
    [0])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=0 --membind=0 ${APP}
        ;;
    [1])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=1 --membind=1 ${APP}
        ;;
    [2])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=2 --membind=2 ${APP}
        ;;
    [3])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=3 --membind=3 ${APP}
        ;;
    [4])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=4 --membind=4 ${APP}
        ;;
    [5])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=5 --membind=5 ${APP}
        ;;
    [6])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=6 --membind=6 ${APP}
        ;;
    [7])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=7 --membind=7 ${APP}
        ;;
esac