Unverified commit 97a9465b, authored by Robert Shaw, committed by GitHub
Browse files

[UX] Add Feedback During CUDAGraph Capture (#19501)


Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
parent c7ea0b56
...@@ -12,6 +12,7 @@ import numpy as np ...@@ -12,6 +12,7 @@ import numpy as np
import torch import torch
import torch.distributed import torch.distributed
import torch.nn as nn import torch.nn as nn
from tqdm import tqdm
import vllm.envs as envs import vllm.envs as envs
from vllm.attention import AttentionType, get_attn_backend from vllm.attention import AttentionType, get_attn_backend
...@@ -2034,7 +2035,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -2034,7 +2035,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# can reuse the memory pool allocated for the large shapes. # can reuse the memory pool allocated for the large shapes.
with graph_capture(device=self.device): with graph_capture(device=self.device):
skip_attn = not self.vllm_config.compilation_config.full_cuda_graph skip_attn = not self.vllm_config.compilation_config.full_cuda_graph
for num_tokens in reversed(self.cudagraph_batch_sizes): for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes),
desc="Capturing CUDA graphs",
total=len(self.cudagraph_batch_sizes)):
for _ in range(self.vllm_config.compilation_config. for _ in range(self.vllm_config.compilation_config.
cudagraph_num_of_warmups): cudagraph_num_of_warmups):
self._dummy_run(num_tokens, skip_attn=skip_attn) self._dummy_run(num_tokens, skip_attn=skip_attn)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment