OpenDAS / Megatron-LM · Commits

Commit 1cc3fbe9, authored Mar 26, 2025 by silencealiang

    add hip profiler

Parent: 2757c9c9 · Pipeline #2579 passed

Showing 6 changed files with 59 additions and 2 deletions (+59 −2):
examples/gpt3/train_gpt_567B_1nodes.sh             +12 −0
examples/gpt3/train_gpt_567B_multinodes.sh         +12 −0
examples/mixtral/train_mixtral_8x7B_1nodes.sh      +12 −0
examples/mixtral/train_mixtral_8x7B_multinodes.sh  +12 −0
megatron/training/arguments.py                      +5 −2
megatron/training/training.py                       +6 −0
examples/gpt3/train_gpt_567B_1nodes.sh

@@ -113,6 +113,14 @@ TORCH_PROFIE_ARGS=(
     --use-pytorch-profiler
 )
 
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
+
 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 2
     --pipeline-model-parallel-size 1
@@ -154,6 +162,10 @@ APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py \
 
 if [[ $profiling == "torch" ]]; then
     APP+=" ${TORCH_PROFIE_ARGS[@]}"
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data
+    APP+=" ${HIP_PROFIE_ARGS[@]}"
+    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
 fi
 
 #for hygon cpu
examples/gpt3/train_gpt_567B_multinodes.sh

@@ -113,6 +113,14 @@ TORCH_PROFIE_ARGS=(
     --use-pytorch-profiler
 )
 
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
+
 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 2
     --pipeline-model-parallel-size 16
@@ -155,6 +163,10 @@ APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py \
 
 if [[ $profiling == "torch" ]]; then
     APP+=" ${TORCH_PROFIE_ARGS[@]}"
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data
+    APP+=" ${HIP_PROFIE_ARGS[@]}"
+    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
 fi
 
 #for hygon cpu
examples/mixtral/train_mixtral_8x7B_1nodes.sh

@@ -116,6 +116,14 @@ TORCH_PROFIE_ARGS=(
     --use-pytorch-profiler
 )
 
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
+
 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 2
     --pipeline-model-parallel-size 1
@@ -157,6 +165,10 @@ APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py \
 
 if [[ $profiling == "torch" ]]; then
     APP+=" ${TORCH_PROFIE_ARGS[@]}"
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data
+    APP+=" ${HIP_PROFIE_ARGS[@]}"
+    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
 fi
 
 #for hygon cpu
examples/mixtral/train_mixtral_8x7B_multinodes.sh

@@ -116,6 +116,14 @@ TORCH_PROFIE_ARGS=(
     --use-pytorch-profiler
 )
 
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
+
 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 2
     --pipeline-model-parallel-size 4
@@ -157,6 +165,10 @@ APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py \
 
 if [[ $profiling == "torch" ]]; then
     APP+=" ${TORCH_PROFIE_ARGS[@]}"
+elif [[ $profiling == "hip" ]]; then
+    mkdir -p hip_prof_data
+    APP+=" ${HIP_PROFIE_ARGS[@]}"
+    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
 fi
 
 #for hygon cpu
megatron/training/arguments.py

@@ -1408,10 +1408,13 @@ def _add_training_args(parser):
                        help='Use the built-in pytorch profiler. '
                        'Useful if you wish to view profiles in tensorboard.',
                        dest='use_pytorch_profiler')
-    group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
-                       help='Global ranks to profile.')
+    group.add_argument('--use-hip-profiler', action='store_true',
+                       help='Use HIP PROFILER',
+                       dest='use_hip_profiler')
     group.add_argument('--profile-dir', type=str, default="./",
                        help='profile dir to save.')
+    group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
+                       help='Global ranks to profile.')
     group.add_argument('--record-memory-history', action="store_true", default=False,
                        help='Record memory history in last rank.')
     group.add_argument('--memory-snapshot-path', type=str, default="snapshot.pickle",
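The net effect in arguments.py: a new --use-hip-profiler flag, with the existing --profile-ranks option moved (unchanged) below --profile-dir. A minimal standalone sketch of how these flags parse, fed the exact values HIP_PROFIE_ARGS passes in the example scripts; --profile-step-start/--profile-step-end are assumed to exist upstream, and their defaults here are placeholders, not values from this diff:

import argparse

# Sketch of the profiling options touched by this commit (subset of
# _add_training_args); option strings and defaults copied from the diff.
parser = argparse.ArgumentParser()
parser.add_argument('--profile', action='store_true')
parser.add_argument('--use-hip-profiler', action='store_true',
                    help='Use HIP PROFILER', dest='use_hip_profiler')
parser.add_argument('--profile-dir', type=str, default="./",
                    help='profile dir to save.')
parser.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
                    help='Global ranks to profile.')
# Assumed from upstream Megatron-LM; placeholder defaults.
parser.add_argument('--profile-step-start', type=int, default=10)
parser.add_argument('--profile-step-end', type=int, default=12)

# The flag set HIP_PROFIE_ARGS passes in the example scripts:
args = parser.parse_args(
    '--profile --profile-ranks 0 1 2 3 4 5 6 7 '
    '--profile-step-start 4 --profile-step-end 5 --use-hip-profiler'.split())
assert args.use_hip_profiler and args.profile_ranks == list(range(8))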
megatron/training/training.py

@@ -1519,6 +1519,9 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
                 #on_trace_ready=torch.profiler.tensorboard_trace_handler('./torch_prof_data'))
                 on_trace_ready=trace_handler)
             prof.start()
+        elif args.profile and torch.distributed.get_rank() in args.profile_ranks and args.use_hip_profiler:
+            import ctypes
+            roctracer = ctypes.cdll.LoadLibrary("/opt/dtk/roctracer/lib/libroctracer64.so")
 
     start_iteration = iteration
     # Disable forward pre-hook to start training to ensure that errors in checkpoint loading
@@ -1543,6 +1546,9 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
         if args.profile and torch.distributed.get_rank() in args.profile_ranks:
             if args.use_pytorch_profiler:
                 prof.step()
+            elif args.use_hip_profiler:
+                if iteration == args.profile_step_start: roctracer.roctracer_start()
+                if iteration == args.profile_step_end: roctracer.roctracer_stop()
             elif iteration == args.profile_step_start:
                 torch.cuda.cudart().cudaProfilerStart()
                 torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
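How the two halves fit together: the shell scripts launch the job under hipprof with --trace-off, so tracing is disabled at startup; the training loop then opens and closes the trace window at the configured steps by calling roctracer's C API (roctracer_start/roctracer_stop) through ctypes. A hedged standalone sketch of the same pattern; the library path is specific to Hygon's DTK install per the diff, and maybe_toggle_trace is an illustrative helper, not a function from this commit:

import ctypes

# Sketch of the ctypes-driven trace window used in train() above.
# Library path is Hygon DTK specific; adjust for your ROCm/DTK install.
roctracer = ctypes.cdll.LoadLibrary("/opt/dtk/roctracer/lib/libroctracer64.so")

def maybe_toggle_trace(iteration, start_step=4, end_step=5):
    # Tracing is off at launch (hipprof --trace-off); these calls bound
    # the iterations whose HIP activity lands in hip_prof_data.
    if iteration == start_step:
        roctracer.roctracer_start()
    if iteration == end_step:
        roctracer.roctracer_stop()

# In the training loop, profiled ranks would call:
#     maybe_toggle_trace(iteration, args.profile_step_start, args.profile_step_end)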