Unverified commit b880ffb8, authored by Michael Goin, committed by GitHub
Browse files

[Misc] Add tqdm progress bar during graph capture (#11349)


Signed-off-by: mgoin <michael@neuralmagic.com>
parent 7801f56e
...@@ -13,6 +13,7 @@ import numpy as np ...@@ -13,6 +13,7 @@ import numpy as np
import torch import torch
import torch.distributed import torch.distributed
import torch.nn as nn import torch.nn as nn
from tqdm import tqdm
import vllm.envs as envs import vllm.envs as envs
from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention import AttentionMetadata, get_attn_backend
...@@ -21,7 +22,8 @@ from vllm.attention.backends.utils import CommonAttentionState ...@@ -21,7 +22,8 @@ from vllm.attention.backends.utils import CommonAttentionState
from vllm.config import CompilationLevel, VllmConfig from vllm.config import CompilationLevel, VllmConfig
from vllm.core.scheduler import SchedulerOutputs from vllm.core.scheduler import SchedulerOutputs
from vllm.distributed import get_kv_transfer_group, get_pp_group from vllm.distributed import get_kv_transfer_group, get_pp_group
from vllm.distributed.parallel_state import graph_capture from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
graph_capture)
from vllm.forward_context import set_forward_context from vllm.forward_context import set_forward_context
from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.inputs import INPUT_REGISTRY, InputRegistry
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -1413,8 +1415,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1413,8 +1415,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
logger.info("Capturing cudagraphs for decoding. This may lead to " logger.info("Capturing cudagraphs for decoding. This may lead to "
"unexpected consequences if the model is not static. To " "unexpected consequences if the model is not static. To "
"run the model in eager mode, set 'enforce_eager=True' or " "run the model in eager mode, set 'enforce_eager=True' or "
"use '--enforce-eager' in the CLI.") "use '--enforce-eager' in the CLI. "
logger.info("If out-of-memory error occurs during cudagraph capture," "If out-of-memory error occurs during cudagraph capture,"
" consider decreasing `gpu_memory_utilization` or " " consider decreasing `gpu_memory_utilization` or "
"switching to eager mode. You can also reduce the " "switching to eager mode. You can also reduce the "
"`max_num_seqs` as needed to decrease memory usage.") "`max_num_seqs` as needed to decrease memory usage.")
...@@ -1451,8 +1453,14 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1451,8 +1453,14 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
# memory usage of CUDA graph. # memory usage of CUDA graph.
for virtual_engine in range( for virtual_engine in range(
self.parallel_config.pipeline_parallel_size): self.parallel_config.pipeline_parallel_size):
for batch_size in \ # Only rank 0 should print progress bar during capture
self.vllm_config.compilation_config.capture_sizes: capture_sizes = (
tqdm(
self.vllm_config.compilation_config.capture_sizes,
desc="Capturing CUDA graph shapes",
) if get_tensor_model_parallel_rank() == 0 else
self.vllm_config.compilation_config.capture_sizes)
for batch_size in capture_sizes:
attn_metadata = ( attn_metadata = (
self.attn_state.graph_capture_get_metadata_for_batch( self.attn_state.graph_capture_get_metadata_for_batch(
batch_size, batch_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment