添加prof参数

52610942 · silencealiang · a65607d4 · 52610942 · 52610942 · 52610942
Commit 52610942 authored Dec 17, 2024 by silencealiang
5 changed files
--- a/Llama_pretraining.sh
+++ b/Llama_pretraining.sh
@@ -32,7 +32,7 @@ export LD_LIBRARY_PATH=/data/rocblas-install/lib:$LD_LIBRARY_PATH

 CHECKPOINT_PATH=./tmp_7b #$1 #<Specify path>
 TENSORBOARD_LOGS_PATH=./tmp_7b  #$2 #<Specify path>
-DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
+DATA_PATH="/public/home/wangxj3/Downloads/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document

 # GPT_MODEL_ARGS=(
 #     --num-layers 32
@@ -115,7 +115,7 @@ DATA_ARGS=(
    --normalization RMSNorm 
    --no-position-embedding 
    --tokenizer-type Llama2Tokenizer
-    --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
+    --tokenizer-model /public/home/wangxj3/Downloads/model_weights/llama2_7b_hf/tokenizer.model
 )

 EVAL_AND_LOGGING_ARGS=(

--- a/README.md
+++ b/README.md
@@ -19,8 +19,18 @@
 2024.12.16适配了torch prof

 使用方法: 启动脚本中添加下列参数, 即可采集对应的prof信息
+
+```bash
+# 采集torchprof
+mpirun -np 8 --allow-run-as-root train_mixtral_8x7B_1nodes.sh localhost --profiling=torch
+
+# 采集hipprof
+mpirun -np 8 --allow-run-as-root train_mixtral_8x7B_1nodes.sh localhost --profiling=hip
+```
+
 ```bash
-PROFILE_ARGS=(
+# prof相关参数
+TORCH_PROFIE_ARGS=(
    --profile # 开启profile
    --profile-step-start 4 # skip前3个iter, warm第4个iter
    --profile-step-end 5 # 采集第5个iter
@@ -28,10 +38,14 @@ PROFILE_ARGS=(
    --profile-ranks 0 3 # 采集全局rank 第0和3
    --profile-dir ./prof_data # prof文件的保存目录
 )
-APP="... \
-    ${PROFILE_ARGS[@]} \
-"
-${APP}
+
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
 ```



--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -1263,6 +1263,9 @@ def _add_training_args(parser):
                       help='Use the built-in pytorch profiler. '
                       'Useful if you wish to view profiles in tensorboard.',
                       dest='use_pytorch_profiler')
+    group.add_argument('--use-hip-profiler', action='store_true',
+                       help='Use HIP PROFILER',
+                       dest='use_hip_profiler')
    group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
                       help='Global ranks to profile.')
    group.add_argument('--profile-dir', type=str, default="./",

--- a/megatron/training/training.py
+++ b/megatron/training/training.py
@@ -1221,7 +1221,8 @@ def post_training_step_callbacks(model, optimizer, opt_param_scheduler, iteratio
        if args.use_pytorch_profiler:
            assert prof is not None
            prof.stop()
-            print_rank_0(f"prof stop!")
+        else:
+            torch.cuda.cudart().cudaProfilerStop()
        

    # Manual garbage collection.
@@ -1412,7 +1413,8 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
        def trace_handler(p):
            from pathlib import Path
            Path(f"{args.profile_dir}").mkdir(parents=True, exist_ok=True)
-            print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
+            if args.rank in [0]:
+                print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
            p.export_chrome_trace("{path}/trace_rank{rank}_step{step}.json".format(
                path=args.profile_dir, rank=torch.distributed.get_rank(), step=p.step_num))

@@ -1426,16 +1428,23 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
            warmup=1 if args.profile_step_start > 0 else 0,
            active=args.profile_step_end-args.profile_step_start,
            repeat=1),
-        # record_shapes=True,
-        # with_stack=True,
-        on_trace_ready=trace_handler,)
+        on_trace_ready=trace_handler)
        prof.start()
+    elif args.profile and torch.distributed.get_rank() in args.profile_ranks and args.use_hip_profiler:
+        import ctypes
+        roctracer = ctypes.cdll.LoadLibrary("/opt/dtk/roctracer/lib/libroctracer64.so")
    
    # Run training iterations till done.
    while iteration < args.train_iters:
        if args.profile and torch.distributed.get_rank() in args.profile_ranks:
            if args.use_pytorch_profiler:
                prof.step()
+            elif args.use_hip_profiler:
+                if iteration == args.profile_step_start: roctracer.roctracer_start()
+                if iteration == args.profile_step_end: roctracer.roctracer_stop()
+            elif iteration == args.profile_step_start:
+                torch.cuda.cudart().cudaProfilerStart()
+                torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()

        maybe_finalize_async_save(blocking=False)


--- a/train_mixtral_8x7B_1nodes.sh
+++ b/train_mixtral_8x7B_1nodes.sh
 #!/bin/bash

+for para in "$@"  # "$@" keeps every argument intact; unquoted $* would word-split and glob-expand them
+do
+    if [[ $para == --profiling=* ]];then  # accept only the --profiling=<mode> form, not look-alike options
+        profiling=${para#*=}  # mode is the text after the first '=' (compared against "torch"/"hip" later)
+        export GPU_FLUSH_ON_EXECUTION=1  # exported only when a profiling mode was requested
+        export HIP_DIRECT_DISPATCH=0
+    fi
+done
+
 source /opt/dtk/env.sh
 # Runs Mixtral 8x7B model
 export HIP_DIRECT_DISPATCH=0
@@ -17,8 +26,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 #export NCCL_SOCKET_IFNAME=enp145s0f0
 export NCCL_NET_GDR_LEVEL=SYS
 export NCCL_NET_GDR_READ=0
+export GLOG_minloglevel=3
 export LD_LIBRARY_PATH=/opt/hipblaslt-install/lib/:$LD_LIBRARY_PATH

+
+
 RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -26,8 +38,8 @@ DIST_URL=${1}
 DIST_PORT=25900

 CHECKPOINT_PATH=./CKPT 
-TOKENIZER_MODEL=../Mixtral8x7B/mixtral_dataset/tokenizer.model
-DATA_PATH=../Mixtral8x7B/mixtral_dataset/my-mixtral_text_document
+TOKENIZER_MODEL=../../megatron-lm/mixtral_dataset/tokenizer.model
+DATA_PATH=../../megatron-lm/mixtral_dataset/my-mixtral_text_document

 DISTRIBUTED_ARGS=(
    --rank ${RANK}
@@ -41,7 +53,7 @@ MODEL_ARGS=(
    --disable-bias-linear
    --seq-length 4096
    --max-position-embeddings 32768
-    --num-layers 2
+    --num-layers 8 #16
    --hidden-size 1024
    --ffn-hidden-size 14336
    --num-attention-heads 32
@@ -65,8 +77,6 @@ MOE_ARGS=(
    --moe-router-load-balancing-type aux_loss
    --moe-aux-loss-coeff 1e-2
    --moe-token-dispatcher-type alltoall
-    --overlap-param-gather
-    --overlap-grad-reduce
    --moe-expert-capacity-factor 0.5
    --moe-pad-expert-input-to-capacity
    --moe-grouped-gemm
@@ -81,7 +91,7 @@ DATA_ARGS=(

 TRAINING_ARGS=(
    --micro-batch-size 1
-    --global-batch-size 16
+    --global-batch-size 128 #256
    --lr 1e-4
    --train-iters 20
    --lr-decay-iters 320000
@@ -91,6 +101,25 @@ TRAINING_ARGS=(
    --lr-warmup-iters 500
    --clip-grad 1.0
    --bf16
+    --overlap-param-gather
+    --overlap-grad-reduce
+)
+
+TORCH_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_data
+    --use-pytorch-profiler
+)
+
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
 )

 MODEL_PARALLEL_ARGS=(
@@ -132,6 +161,14 @@ APP="python3 -u pretrain_gpt.py \
    ${LOGGING_ARGS[@]} \
    "

+if [[ $profiling == "torch" ]]; then  # append the torch-profiler flags to the launch command string
+    APP+=" ${TORCH_PROFIE_ARGS[*]}"  # [*] joins the array into one space-separated string (APP is a scalar)
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data  # directory passed to hipprof via -d below
+    APP+=" ${HIP_PROFIE_ARGS[*]}"  # same explicit join as above
+    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"  # prefix the whole command with hipprof
+fi
+
 #for hygon cpu
 case ${LOCAL_RANK} in
 [0])