[Misc] Respect `no_use_tqdm_on_load` flag while capturing CUDA graph (#20834)

Signed-off-by: Linkun <github@lkchen.net>

[Misc] Respect `no_use_tqdm_on_load` flag while capturing CUDA graph (#20834)
Signed-off-by: Linkun <github@lkchen.net>
f56d2996 · lkchen · GitHub · 147afb44 · f56d2996 · f56d2996
Unverified Commit f56d2996 authored Jul 11, 2025 by lkchen Committed by GitHub Jul 11, 2025
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 5 additions and 2 deletions

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +4 -2

vllm/worker/model_runner.py vllm/worker/model_runner.py +1 -0

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2270,8 +2270,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            # Only rank 0 should print progress bar during capture
            compilation_cases = reversed(self.cudagraph_batch_sizes)
            if is_global_first_rank():
-                compilation_cases = tqdm(list(compilation_cases),
-                                         desc="Capturing CUDA graph shapes")
+                compilation_cases = tqdm(
+                    list(compilation_cases),
+                    disable=not self.load_config.use_tqdm_on_load,
+                    desc="Capturing CUDA graph shapes")
            for num_tokens in compilation_cases:
                # We skip EPLB here since we don't want to record dummy metrics
                for _ in range(

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1587,6 +1587,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                if get_tensor_model_parallel_rank() == 0:
                    compilation_cases = tqdm(
                        list(compilation_cases),
+                        disable=not self.load_config.use_tqdm_on_load,
                        desc="Capturing CUDA graph shapes")
                for batch_size, use_inputs_embeds in compilation_cases:
                    attn_metadata = (