Unverified Commit 936e0b79 authored by Shanshan Shen's avatar Shanshan Shen Committed by GitHub
Browse files

[MM][CG] Optimize default `max_frames_per_batch` auto-infer for ViT CUDA graph...


[MM][CG] Optimize default `max_frames_per_batch` auto-infer for ViT CUDA graph video inference (#40445)
Signed-off-by: shen-shanshan <467638484@qq.com>
parent b2a55186
......@@ -76,6 +76,7 @@ Models opt-in to encoder CUDA Graphs by implementing the [SupportsEncoderCudaGra
* `encoder_cudagraph_forward(...)` — forward pass using precomputed buffers (called during capture and replay).
* `encoder_eager_forward(...)` — fallback eager forward when no graph fits.
* `get_input_modality(...)` - return the modality of the inputs.
* `get_max_frames_per_video()` — return the model-specific maximum number of frames per video.
!!! note
The `SupportsEncoderCudaGraph` protocol is designed to be model-agnostic. New vision encoder models can opt-in by implementing the protocol methods without modifying the manager.
......@@ -96,7 +97,7 @@ Three fields in `CompilationConfig` control encoder CUDA Graphs:
* `cudagraph_mm_encoder` (`bool`, default `False`) — enable CUDA Graph capture for multimodal encoder. When enabled, captures the full encoder forward as a CUDA Graph for each token budget level.
* `encoder_cudagraph_token_budgets` (`list[int]`, default `[]`) — token budget levels for capture. If empty (default), auto-inferred from model architecture as power-of-2 levels. User-provided values override auto-inference.
* `encoder_cudagraph_max_vision_items_per_batch` (`int`, default `0`) — maximum number of images/videos per batch during capture. If 0 (default), auto-inferred as `max_budget // min_budget`.
* `encoder_cudagraph_max_frames_per_batch` (`int`, default `0`) — maximum number of video frames per batch during capture. If 0 (default), auto-inferred as `encoder_cudagraph_max_vision_items_per_batch * 2` (to be optimized).
* `encoder_cudagraph_max_frames_per_batch` (`int | None`, default `None`) — maximum number of video frames per batch during capture. If `None` (default), it is auto-inferred as `encoder_cudagraph_max_vision_items_per_batch * max_frames_per_video`, where `max_frames_per_video` is a model-specific value obtained from the model's `processing_info`. If the video count per prompt is limited to `0`, it is also set to `0` (i.e., falls back to image-only mode).
## Usage guide
......
......@@ -535,13 +535,15 @@ class CompilationConfig:
model's budget range. User-provided positive value overrides
auto-inference."""
encoder_cudagraph_max_frames_per_batch: int = 0
encoder_cudagraph_max_frames_per_batch: int | None = None
"""Maximum total video frames per batch for encoder CUDA graph capture.
Controls the cu_seqlens buffer size (one entry per attention sequence,
i.e. one per video frame). If 0 (default), auto-inferred per budget
level as token_budget (tight bound: packing guarantees
sum(T_i) <= token_budget). Positive value overrides auto-inference
and applies to all budget levels."""
i.e. one per video frame).
If None (default), auto-inferred as encoder_cudagraph_max_vision_items_per_batch
* max_frames_per_video (model-specific value according to processing_info).
Positive value overrides auto-inference and applies to all budget levels.
If we limit the video count per prompt to `0`, it will also be set to `0`
(i.e., fall back to image-only mode)."""
# Inductor capture
compile_sizes: list[int | str] | None = None
......@@ -993,11 +995,12 @@ class CompilationConfig:
)
if (
self.cudagraph_mm_encoder
and self.encoder_cudagraph_max_frames_per_batch is not None
and self.encoder_cudagraph_max_frames_per_batch < 0
):
raise ValueError(
"encoder_cudagraph_max_frames_per_batch must be "
"non-negative (0 = auto-infer)"
"non-negative (None = auto-infer)"
)
if self.backend == "":
......
......@@ -1531,6 +1531,12 @@ class SupportsEncoderCudaGraph(Protocol):
"""Return the modality of the inputs."""
...
def get_max_frames_per_video(self) -> int:
    """Return model-specific max frames per video.

    This is a protocol stub: the body is intentionally empty, and each
    model that implements the protocol supplies its own value.
    """
    ...
def get_encoder_cudagraph_budget_range(
self,
vllm_config: "VllmConfig",
......
......@@ -1640,6 +1640,7 @@ class Qwen3VLForConditionalGeneration(
multimodal_config = vllm_config.model_config.multimodal_config
self.config = config
self.model_config = vllm_config.model_config
self._tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
self.multimodal_config = multimodal_config
self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
......@@ -1783,6 +1784,15 @@ class Qwen3VLForConditionalGeneration(
return "image"
return "video"
def get_max_frames_per_video(self) -> int:
    """Return the model-specific maximum number of frames per video.

    The value is delegated to the processing info registered for this
    model: ``get_num_frames_with_most_features`` is called with the
    model's ``max_model_len`` as ``seq_len`` and the configured per-prompt
    video limit as the ``"video"`` count.
    """
    # Look up the processing info first, then gather the arguments in the
    # same order as they are passed to the call.
    processing_info = MULTIMODAL_REGISTRY.get_processing_info(self.model_config)
    max_len = self.model_config.max_model_len
    video_limit = self.multimodal_config.get_limit_per_prompt("video")
    return processing_info.get_num_frames_with_most_features(
        seq_len=max_len,
        mm_counts={"video": video_limit},
    )
def get_encoder_cudagraph_budget_range(
self,
vllm_config,
......
......@@ -193,6 +193,9 @@ class MultiModalRegistry:
ctx = self._create_processing_ctx(model_config, tokenizer)
return factories.info(ctx)
def get_processing_info(self, model_config: "ModelConfig") -> BaseProcessingInfo:
    """Return the processing info for ``model_config``.

    Public wrapper around ``_create_processing_info`` that always passes
    ``tokenizer=None``.
    """
    # NOTE(review): how `_create_processing_info` handles a missing
    # tokenizer is not visible from here — confirm against that helper.
    processing_info = self._create_processing_info(model_config, tokenizer=None)
    return processing_info
def create_processor(
self,
model_config: "ModelConfig",
......
......@@ -67,13 +67,15 @@ class EncoderCudaGraphManager:
comp_config = vllm_config.compilation_config
user_budgets = comp_config.encoder_cudagraph_token_budgets
user_max_mm_items = comp_config.encoder_cudagraph_max_vision_items_per_batch
user_max_vision_items = comp_config.encoder_cudagraph_max_vision_items_per_batch
user_max_frames = comp_config.encoder_cudagraph_max_frames_per_batch
if user_budgets and user_max_mm_items > 0:
multimodal_config = vllm_config.model_config.multimodal_config
if user_budgets and user_max_vision_items > 0:
# Fully user-specified
self.token_budgets = sorted(user_budgets)
self.max_batch_size = user_max_mm_items
self.max_batch_size = user_max_vision_items
else:
# Auto-infer missing values from model
min_budget, max_budget = model.get_encoder_cudagraph_budget_range(
......@@ -85,14 +87,20 @@ class EncoderCudaGraphManager:
else self._generate_budgets(min_budget, max_budget)
)
self.max_batch_size = (
user_max_mm_items if user_max_mm_items > 0 else max_budget // min_budget
user_max_vision_items
if user_max_vision_items > 0
else max_budget // min_budget
)
if user_max_frames > 0:
assert multimodal_config is not None
if multimodal_config.get_limit_per_prompt("video") == 0:
self.max_frames_per_batch = 0
elif user_max_frames is not None:
self.max_frames_per_batch = user_max_frames
else:
# TODO(shen-shanshan): optimize this auto-infer for max_frames_per_batch.
self.max_frames_per_batch = self.max_batch_size * 2
# Set it to the model-specific value according to its `processing_info`.
max_frames_per_video = self.model.get_max_frames_per_video()
self.max_frames_per_batch = self.max_batch_size * max_frames_per_video
mm_config = vllm_config.model_config.multimodal_config
self.use_dp = (
......@@ -111,7 +119,7 @@ class EncoderCudaGraphManager:
"budgets=%s, max_batch_size=%d, max_frames_per_batch=%s, use_dp=%s",
self.token_budgets,
self.max_batch_size,
self.max_frames_per_batch if self.max_frames_per_batch > 0 else "auto",
self.max_frames_per_batch,
self.use_dp,
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment