Unverified commit 97a9465b, authored by Robert Shaw, committed by GitHub
Browse files

[UX] Add Feedback During CUDAGraph Capture (#19501)


Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
parent c7ea0b56
......@@ -12,6 +12,7 @@ import numpy as np
import torch
import torch.distributed
import torch.nn as nn
from tqdm import tqdm
import vllm.envs as envs
from vllm.attention import AttentionType, get_attn_backend
......@@ -2034,7 +2035,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# can reuse the memory pool allocated for the large shapes.
with graph_capture(device=self.device):
skip_attn = not self.vllm_config.compilation_config.full_cuda_graph
for num_tokens in reversed(self.cudagraph_batch_sizes):
for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes),
desc="Capturing CUDA graphs",
total=len(self.cudagraph_batch_sizes)):
for _ in range(self.vllm_config.compilation_config.
cudagraph_num_of_warmups):
self._dummy_run(num_tokens, skip_attn=skip_attn)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment