[UX] Add Feedback During CUDAGraph Capture (#19501)

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>

[UX] Add Feedback During CUDAGraph Capture (#19501)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
97a9465b · Robert Shaw · GitHub · c7ea0b56 · 97a9465b
Unverified commit 97a9465b — authored Jun 11, 2025 by Robert Shaw; committed by GitHub on Jun 11, 2025
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 4 additions and 1 deletion

vllm/v1/worker/gpu_model_runner.py (+4 -1)

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -12,6 +12,7 @@ import numpy as np
 import torch
 import torch.distributed
 import torch.nn as nn
+from tqdm import tqdm

 import vllm.envs as envs
 from vllm.attention import AttentionType, get_attn_backend
@@ -2034,7 +2035,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        # can reuse the memory pool allocated for the large shapes.
        with graph_capture(device=self.device):
            skip_attn = not self.vllm_config.compilation_config.full_cuda_graph
-            for num_tokens in reversed(self.cudagraph_batch_sizes):
+            for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes),
+                                   desc="Capturing CUDA graphs",
+                                   total=len(self.cudagraph_batch_sizes)):
                for _ in range(self.vllm_config.compilation_config.
                               cudagraph_num_of_warmups):
                    self._dummy_run(num_tokens, skip_attn=skip_attn)