Commit 52610942 authored by silencealiang

Add prof arguments

parent a65607d4
@@ -32,7 +32,7 @@ export LD_LIBRARY_PATH=/data/rocblas-install/lib:$LD_LIBRARY_PATH
 CHECKPOINT_PATH=./tmp_7b #$1 #<Specify path>
 TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
-DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
+DATA_PATH="/public/home/wangxj3/Downloads/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
 # GPT_MODEL_ARGS=(
 #     --num-layers 32
@@ -115,7 +115,7 @@ DATA_ARGS=(
     --normalization RMSNorm
     --no-position-embedding
     --tokenizer-type Llama2Tokenizer
-    --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
+    --tokenizer-model /public/home/wangxj3/Downloads/model_weights/llama2_7b_hf/tokenizer.model
 )
 EVAL_AND_LOGGING_ARGS=(
......
@@ -19,8 +19,18 @@
 2024.12.16: adapted the torch profiler (torch prof).
 Usage: add the following arguments to the launch script to collect the corresponding profiling data.
+```bash
+# collect a torch prof
+mpirun -np 8 --allow-run-as-root train_mixtral_8x7B_1nodes.sh localhost --profiling=torch
+# collect a hipprof
+mpirun -np 8 --allow-run-as-root train_mixtral_8x7B_1nodes.sh localhost --profiling=hip
+```
 ```bash
-PROFILE_ARGS=(
+# profiling-related arguments
+TORCH_PROFILE_ARGS=(
     --profile                 # enable profiling
     --profile-step-start 4    # skip the first 3 iters, warm up on iter 4
     --profile-step-end 5      # capture iter 5
@@ -28,10 +38,14 @@ PROFILE_ARGS=(
     --profile-ranks 0 3       # profile global ranks 0 and 3
     --profile-dir ./prof_data # directory where the prof files are saved
 )
-APP="... \
-    ${PROFILE_ARGS[@]} \
-"
-${APP}
+HIP_PROFILE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
 ```
......
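For the torch path, each profiled rank exports a chrome trace named trace_rank{rank}_step{step}.json under --profile-dir (see the training.py hunk further down). A minimal sketch for sanity-checking those files, assuming the ./prof_data directory from the example above; this helper is not part of the commit:

```python
import json
from pathlib import Path

# Chrome traces exported by trace_handler land under --profile-dir
# (./prof_data above), one file per profiled rank and step.
for trace_file in sorted(Path("./prof_data").glob("trace_rank*_step*.json")):
    with trace_file.open() as f:
        trace = json.load(f)
    events = trace.get("traceEvents", [])
    # "X" (complete) events carry a duration in microseconds under "dur".
    total_us = sum(e.get("dur", 0) for e in events if e.get("ph") == "X")
    print(f"{trace_file.name}: {len(events)} events, {total_us / 1e3:.1f} ms of complete events")
```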
@@ -1263,6 +1263,9 @@ def _add_training_args(parser):
                        help='Use the built-in pytorch profiler. '
                        'Useful if you wish to view profiles in tensorboard.',
                        dest='use_pytorch_profiler')
+    group.add_argument('--use-hip-profiler', action='store_true',
+                       help='Use the HIP profiler.',
+                       dest='use_hip_profiler')
     group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
                        help='Global ranks to profile.')
     group.add_argument('--profile-dir', type=str, default="./",
......
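A minimal standalone sketch of how the new switch parses alongside --profile-ranks, mirroring the group.add_argument calls above (the group title here is assumed for illustration):

```python
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='profiling')
group.add_argument('--use-hip-profiler', action='store_true',
                   help='Use the HIP profiler.', dest='use_hip_profiler')
group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
                   help='Global ranks to profile.')

# e.g. the flags passed by HIP_PROFILE_ARGS in the launch script
args = parser.parse_args(['--use-hip-profiler', '--profile-ranks', '0', '3'])
assert args.use_hip_profiler and args.profile_ranks == [0, 3]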
@@ -1221,7 +1221,8 @@ def post_training_step_callbacks(model, optimizer, opt_param_scheduler, iteration,
         if args.use_pytorch_profiler:
             assert prof is not None
             prof.stop()
-            print_rank_0(f"prof stop!")
+        else:
+            torch.cuda.cudart().cudaProfilerStop()
     # Manual garbage collection.
@@ -1412,7 +1413,8 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
         def trace_handler(p):
             from pathlib import Path
             Path(f"{args.profile_dir}").mkdir(parents=True, exist_ok=True)
-            print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
+            if args.rank in [0]:
+                print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
             p.export_chrome_trace("{path}/trace_rank{rank}_step{step}.json".format(
                 path=args.profile_dir, rank=torch.distributed.get_rank(), step=p.step_num))
@@ -1426,16 +1428,23 @@
                 warmup=1 if args.profile_step_start > 0 else 0,
                 active=args.profile_step_end-args.profile_step_start,
                 repeat=1),
-            # record_shapes=True,
-            # with_stack=True,
-            on_trace_ready=trace_handler,)
+            on_trace_ready=trace_handler)
         prof.start()
+    elif args.profile and torch.distributed.get_rank() in args.profile_ranks and args.use_hip_profiler:
+        import ctypes
+        roctracer = ctypes.cdll.LoadLibrary("/opt/dtk/roctracer/lib/libroctracer64.so")
     # Run training iterations till done.
     while iteration < args.train_iters:
         if args.profile and torch.distributed.get_rank() in args.profile_ranks:
             if args.use_pytorch_profiler:
                 prof.step()
+            elif args.use_hip_profiler:
+                if iteration == args.profile_step_start: roctracer.roctracer_start()
+                if iteration == args.profile_step_end: roctracer.roctracer_stop()
             elif iteration == args.profile_step_start:
                 torch.cuda.cudart().cudaProfilerStart()
                 torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
         maybe_finalize_async_save(blocking=False)
......
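The roctracer calls above gate collection to a window of iterations: tracing is off until roctracer_start() and off again after roctracer_stop(), so only the chosen steps are recorded. A self-contained sketch of that pattern, assuming the DTK library path from the diff and the step numbers from HIP_PROFILE_ARGS; the training step itself is elided:

```python
import ctypes

# roctracer_start/roctracer_stop are part of the roctracer C API.
roctracer = ctypes.cdll.LoadLibrary("/opt/dtk/roctracer/lib/libroctracer64.so")

profile_step_start, profile_step_end = 4, 5
for iteration in range(profile_step_end + 1):
    if iteration == profile_step_start:
        roctracer.roctracer_start()  # collection on: HIP activity is traced from here
    if iteration == profile_step_end:
        roctracer.roctracer_stop()   # collection off: later iterations are not traced
    # ... one training step would run here ...
```

This pairs with the `hipprof ... --trace-off` wrapper in the launch script below, which presumably starts hipprof with collection disabled so that these start/stop calls define the traced window.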
 #!/bin/bash
+for para in $*
+do
+    if [[ $para == --profiling* ]];then
+        # ${para#*=} keeps the text after the first '=', e.g. "torch" or "hip"
+        # (a hypothetical Python equivalent appears after this diff)
+        profiling=${para#*=}
+        export GPU_FLUSH_ON_EXECUTION=1
+        export HIP_DIRECT_DISPATCH=0
+    fi
+done
 source /opt/dtk/env.sh
 # Runs Mixtral 8x7B model
 export HIP_DIRECT_DISPATCH=0
@@ -17,8 +26,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 #export NCCL_SOCKET_IFNAME=enp145s0f0
 export NCCL_NET_GDR_LEVEL=SYS
 export NCCL_NET_GDR_READ=0
+export GLOG_minloglevel=3
 export LD_LIBRARY_PATH=/opt/hipblaslt-install/lib/:$LD_LIBRARY_PATH
 RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -26,8 +38,8 @@ DIST_URL=${1}
 DIST_PORT=25900
 CHECKPOINT_PATH=./CKPT
-TOKENIZER_MODEL=../Mixtral8x7B/mixtral_dataset/tokenizer.model
-DATA_PATH=../Mixtral8x7B/mixtral_dataset/my-mixtral_text_document
+TOKENIZER_MODEL=../../megatron-lm/mixtral_dataset/tokenizer.model
+DATA_PATH=../../megatron-lm/mixtral_dataset/my-mixtral_text_document
 DISTRIBUTED_ARGS=(
     --rank ${RANK}
@@ -41,7 +53,7 @@ MODEL_ARGS=(
     --disable-bias-linear
     --seq-length 4096
     --max-position-embeddings 32768
-    --num-layers 2
+    --num-layers 8 #16
     --hidden-size 1024
     --ffn-hidden-size 14336
     --num-attention-heads 32
@@ -65,8 +77,6 @@ MOE_ARGS=(
     --moe-router-load-balancing-type aux_loss
     --moe-aux-loss-coeff 1e-2
     --moe-token-dispatcher-type alltoall
-    --overlap-param-gather
-    --overlap-grad-reduce
     --moe-expert-capacity-factor 0.5
     --moe-pad-expert-input-to-capacity
     --moe-grouped-gemm
@@ -81,7 +91,7 @@ DATA_ARGS=(
 TRAINING_ARGS=(
     --micro-batch-size 1
-    --global-batch-size 16
+    --global-batch-size 128 #256
     --lr 1e-4
     --train-iters 20
     --lr-decay-iters 320000
@@ -91,6 +101,25 @@
     --lr-warmup-iters 500
     --clip-grad 1.0
     --bf16
+    --overlap-param-gather
+    --overlap-grad-reduce
+)
+TORCH_PROFILE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_data
+    --use-pytorch-profiler
+)
+HIP_PROFILE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
 )
 MODEL_PARALLEL_ARGS=(
@@ -132,6 +161,14 @@ APP="python3 -u pretrain_gpt.py \
     ${LOGGING_ARGS[@]} \
 "
+if [[ $profiling == "torch" ]]; then
+    APP+=" ${TORCH_PROFILE_ARGS[@]}"
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data
+    APP+=" ${HIP_PROFILE_ARGS[@]}"
+    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
+fi
 #for hygon cpu
 case ${LOCAL_RANK} in
 [0])
......
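For readers less familiar with the shell parameter expansion, a hypothetical Python equivalent of the --profiling loop at the top of train_mixtral_8x7B_1nodes.sh; this is illustration only, not part of the commit:

```python
import sys

# ${para#*=} strips everything up to and including the first '=',
# leaving "torch" or "hip".
profiling = None
for para in sys.argv[1:]:
    if para.startswith("--profiling") and "=" in para:
        profiling = para.split("=", 1)[1]

print(profiling)  # "torch", "hip", or None if the flag was not passed
```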