Commit 52610942 authored by silencealiang's avatar silencealiang
Browse files

添加prof参数

parent a65607d4
Pipeline #2117 failed with stages
in 0 seconds
......@@ -32,7 +32,7 @@ export LD_LIBRARY_PATH=/data/rocblas-install/lib:$LD_LIBRARY_PATH
CHECKPOINT_PATH=./tmp_7b #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
DATA_PATH="/public/home/wangxj3/Downloads/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
# GPT_MODEL_ARGS=(
# --num-layers 32
......@@ -115,7 +115,7 @@ DATA_ARGS=(
--normalization RMSNorm
--no-position-embedding
--tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
--tokenizer-model /public/home/wangxj3/Downloads/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
......
......@@ -19,8 +19,18 @@
2024.12.16适配了torch prof
使用方法: 启动脚本中添加下列参数, 即可采集对应的prof信息
```bash
# 采集torchprof
mpirun -np 8 --allow-run-as-root train_mixtral_8x7B_1nodes.sh localhost --profiling=torch
# 采集hipprof
mpirun -np 8 --allow-run-as-root train_mixtral_8x7B_1nodes.sh localhost --profiling=hip
```
```bash
PROFILE_ARGS=(
# prof相关参数
TORCH_PROFIE_ARGS=(
--profile # 开启profile
--profile-step-start 4 # skip前3个iter, warm第4个iter
--profile-step-end 5 # 采集第5个iter
......@@ -28,10 +38,14 @@ PROFILE_ARGS=(
--profile-ranks 0 3 # 采集全局rank 第0和3
--profile-dir ./prof_data # prof文件的保存目录
)
APP="... \
${PROFILE_ARGS[@]} \
"
${APP}
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
```
......
......@@ -1263,6 +1263,9 @@ def _add_training_args(parser):
help='Use the built-in pytorch profiler. '
'Useful if you wish to view profiles in tensorboard.',
dest='use_pytorch_profiler')
group.add_argument('--use-hip-profiler', action='store_true',
help='Use HIP PROFILER',
dest='use_hip_profiler')
group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
help='Global ranks to profile.')
group.add_argument('--profile-dir', type=str, default="./",
......
......@@ -1221,7 +1221,8 @@ def post_training_step_callbacks(model, optimizer, opt_param_scheduler, iteratio
if args.use_pytorch_profiler:
assert prof is not None
prof.stop()
print_rank_0(f"prof stop!")
else:
torch.cuda.cudart().cudaProfilerStop()
# Manual garbage collection.
......@@ -1412,7 +1413,8 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
def trace_handler(p):
from pathlib import Path
Path(f"{args.profile_dir}").mkdir(parents=True, exist_ok=True)
print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
if args.rank in [0]:
print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
p.export_chrome_trace("{path}/trace_rank{rank}_step{step}.json".format(
path=args.profile_dir, rank=torch.distributed.get_rank(), step=p.step_num))
......@@ -1426,16 +1428,23 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
warmup=1 if args.profile_step_start > 0 else 0,
active=args.profile_step_end-args.profile_step_start,
repeat=1),
# record_shapes=True,
# with_stack=True,
on_trace_ready=trace_handler,)
on_trace_ready=trace_handler)
prof.start()
elif args.profile and torch.distributed.get_rank() in args.profile_ranks and args.use_hip_profiler:
import ctypes
roctracer = ctypes.cdll.LoadLibrary("/opt/dtk/roctracer/lib/libroctracer64.so")
# Run training iterations till done.
while iteration < args.train_iters:
if args.profile and torch.distributed.get_rank() in args.profile_ranks:
if args.use_pytorch_profiler:
prof.step()
elif args.use_hip_profiler:
if iteration == args.profile_step_start: roctracer.roctracer_start()
if iteration == args.profile_step_end: roctracer.roctracer_stop()
elif iteration == args.profile_step_start:
torch.cuda.cudart().cudaProfilerStart()
torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
maybe_finalize_async_save(blocking=False)
......
#!/bin/bash
#######################################
# Scan the launcher arguments for a "--profiling=<mode>" switch.
# When one is found, the text after the first "=" is stored in the global
# variable `profiling` and the two GPU runtime variables below are exported
# for the profiling run. If the switch appears several times, the last
# occurrence wins.
# Globals:   profiling (written), GPU_FLUSH_ON_EXECUTION (exported),
#            HIP_DIRECT_DISPATCH (exported)
# Arguments: the script's positional parameters
#######################################
parse_profiling_arg() {
  local para
  # Iterate over "$@" (not an unquoted $*) so that every argument is examined
  # exactly as it was passed: no word splitting, no glob expansion.
  for para in "$@"; do
    if [[ "$para" == --profiling* ]]; then
      profiling=${para#*=}
      export GPU_FLUSH_ON_EXECUTION=1
      export HIP_DIRECT_DISPATCH=0
    fi
  done
}
parse_profiling_arg "$@"
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
export HIP_DIRECT_DISPATCH=0
......@@ -17,8 +26,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
#export NCCL_SOCKET_IFNAME=enp145s0f0
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
export GLOG_minloglevel=3
export LD_LIBRARY_PATH=/opt/hipblaslt-install/lib/:$LD_LIBRARY_PATH
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -26,8 +38,8 @@ DIST_URL=${1}
DIST_PORT=25900
CHECKPOINT_PATH=./CKPT
TOKENIZER_MODEL=../Mixtral8x7B/mixtral_dataset/tokenizer.model
DATA_PATH=../Mixtral8x7B/mixtral_dataset/my-mixtral_text_document
TOKENIZER_MODEL=../../megatron-lm/mixtral_dataset/tokenizer.model
DATA_PATH=../../megatron-lm/mixtral_dataset/my-mixtral_text_document
DISTRIBUTED_ARGS=(
--rank ${RANK}
......@@ -41,7 +53,7 @@ MODEL_ARGS=(
--disable-bias-linear
--seq-length 4096
--max-position-embeddings 32768
--num-layers 2
--num-layers 8 #16
--hidden-size 1024
--ffn-hidden-size 14336
--num-attention-heads 32
......@@ -65,8 +77,6 @@ MOE_ARGS=(
--moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-2
--moe-token-dispatcher-type alltoall
--overlap-param-gather
--overlap-grad-reduce
--moe-expert-capacity-factor 0.5
--moe-pad-expert-input-to-capacity
--moe-grouped-gemm
......@@ -81,7 +91,7 @@ DATA_ARGS=(
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 16
--global-batch-size 128 #256
--lr 1e-4
--train-iters 20
--lr-decay-iters 320000
......@@ -91,6 +101,25 @@ TRAINING_ARGS=(
--lr-warmup-iters 500
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
)
# Extra training flags appended to APP when the script is launched with
# --profiling=torch.
# NOTE: "PROFIE" is a misspelling of "PROFILE"; the name is kept as-is because
# it is referenced where APP is assembled.
TORCH_PROFIE_ARGS=(
# enable profiling
--profile
# global ranks to profile
--profile-ranks 0 1 2 3 4 5 6 7
# iteration at which profiling starts
--profile-step-start 3
# iteration at which profiling ends
--profile-step-end 4
# directory where the profiler output is saved
--profile-dir torch_prof_data
# use the built-in PyTorch profiler
--use-pytorch-profiler
)
# Extra training flags appended to APP when the script is launched with
# --profiling=hip.
# NOTE: "PROFIE" is a misspelling of "PROFILE"; the name is kept as-is because
# it is referenced where APP is assembled.
HIP_PROFIE_ARGS=(
# enable profiling
--profile
# global ranks to profile
--profile-ranks 0 1 2 3 4 5 6 7
# iteration at which tracing is started
--profile-step-start 4
# iteration at which tracing is stopped
--profile-step-end 5
# use the HIP profiler
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
......@@ -132,6 +161,14 @@ APP="python3 -u pretrain_gpt.py \
${LOGGING_ARGS[@]} \
"
# Extend the launch command according to the mode chosen via --profiling=<mode>.
#   torch : append the PyTorch profiler flags to APP.
#   hip   : create the output directory, append the HIP profiler flags and
#           run the whole command under hipprof.
# Any other value (including unset) leaves APP unchanged.
case "${profiling}" in
  torch)
    APP="${APP} ${TORCH_PROFIE_ARGS[@]}"
    ;;
  hip)
    mkdir -p hip_prof_data
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP} ${HIP_PROFIE_ARGS[@]}"
    ;;
esac
#for hygon cpu
case ${LOCAL_RANK} in
[0])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment