Unverified commit 6535fda1 authored by Cheng Wan, committed by GitHub

[Profile] dump memory trace when cuda graph profile is enabled (#11083)

parent 3713eb61
@@ -437,6 +437,7 @@ class CudaGraphRunner:
                 activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 record_shapes=True,
             )
+            torch.cuda.memory._record_memory_history()
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
@@ -485,6 +486,8 @@ class CudaGraphRunner:
         save_gemlite_cache()
         if self.enable_profile_cuda_graph:
+            torch.cuda.memory._dump_snapshot(f"cuda_graph_runner_memory_usage.pickle")
+            torch.cuda.memory._record_memory_history(enabled=None)
             log_message = (
                 "Sorted by CUDA Time:\n"
                 + prof.key_averages(group_by_input_shape=True).table(
@@ -494,6 +497,7 @@ class CudaGraphRunner:
                 + prof.key_averages(group_by_input_shape=True).table(
                     sort_by="cpu_time_total", row_limit=10
                 )
+                + "\n\nMemory Usage is saved to cuda_graph_runner_memory_usage.pickle\n"
             )
             logger.info(log_message)
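The change wires PyTorch's CUDA caching-allocator history recorder into the existing enable_profile_cuda_graph path: recording starts right after the profiler is set up, and once graph capture finishes the history is dumped to cuda_graph_runner_memory_usage.pickle and recording is turned off with enabled=None. Below is a minimal, standalone sketch of the same pattern outside SGLang; the profile_with_memory_trace helper, the matmul workload, and the snapshot file name are illustrative placeholders, not part of the commit.

import torch
from torch.profiler import ProfilerActivity, profile


def profile_with_memory_trace(fn, snapshot_path="memory_usage.pickle"):
    # Start recording CUDA caching-allocator events (alloc/free history).
    torch.cuda.memory._record_memory_history()
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
    ) as prof:
        fn()
    # Persist the recorded history, then stop recording (enabled=None).
    torch.cuda.memory._dump_snapshot(snapshot_path)
    torch.cuda.memory._record_memory_history(enabled=None)
    # Mirror the commit's logging: top ops by time plus a pointer to the snapshot.
    print(
        prof.key_averages(group_by_input_shape=True).table(
            sort_by="cuda_time_total", row_limit=10
        )
    )
    print(f"Memory usage is saved to {snapshot_path}")


if __name__ == "__main__":
    x = torch.randn(1024, 1024, device="cuda")
    profile_with_memory_trace(lambda: (x @ x).sum().item())

The saved pickle can be opened in the PyTorch memory visualizer at https://pytorch.org/memory_viz to browse allocation history over time.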