[MM][CG] Optimize default `max_frames_per_batch` auto-infer for ViT CUDA graph...

[MM][CG] Optimize default `max_frames_per_batch` auto-infer for ViT CUDA graph video inference (#40445) Signed-off-by: shen-shanshan <467638484@qq.com>

[MM][CG] Optimize default `max_frames_per_batch` auto-infer for ViT CUDA graph...
[MM][CG] Optimize default `max_frames_per_batch` auto-infer for ViT CUDA graph video inference (#40445) Signed-off-by: shen-shanshan <467638484@qq.com>
936e0b79 · Shanshan Shen · GitHub · b2a55186 · 936e0b79 · 936e0b79
Unverified Commit 936e0b79 authored Apr 21, 2026 by Shanshan Shen Committed by GitHub Apr 21, 2026
6 changed files
--- a/docs/design/cuda_graphs_multimodal.md
+++ b/docs/design/cuda_graphs_multimodal.md
@@ -76,6 +76,7 @@ Models opt-in to encoder CUDA Graphs by implementing the [SupportsEncoderCudaGra
 * `encoder_cudagraph_forward(...)` — forward pass using precomputed buffers (called during capture and replay).
 * `encoder_eager_forward(...)` — fallback eager forward when no graph fits.
 * `get_input_modality(...)` - return the modality of the inputs.
+* `get_max_frames_per_video()` - return the model-specific maximum number of frames per video.

 !!! note
    The `SupportsEncoderCudaGraph` protocol is designed to be model-agnostic. New vision encoder models can opt-in by implementing the protocol methods without modifying the manager.
@@ -96,7 +97,7 @@ Three fields in `CompilationConfig` control encoder CUDA Graphs:
 * `cudagraph_mm_encoder` (`bool`, default `False`) — enable CUDA Graph capture for multimodal encoder. When enabled, captures the full encoder forward as a CUDA Graph for each token budget level.
 * `encoder_cudagraph_token_budgets` (`list[int]`, default `[]`) — token budget levels for capture. If empty (default), auto-inferred from model architecture as power-of-2 levels. User-provided values override auto-inference.
 * `encoder_cudagraph_max_vision_items_per_batch` (`int`, default `0`) — maximum number of images/videos per batch during capture. If 0 (default), auto-inferred as `max_budget // min_budget`.
-* `encoder_cudagraph_max_frames_per_batch` (`int`, default `0`) — maximum number of video frames per batch during capture. If 0 (default), auto-inferred as `encoder_cudagraph_max_vision_items_per_batch * 2` (to be optimized).
+* `encoder_cudagraph_max_frames_per_batch` (`int | None`, default `None`) — maximum number of video frames per batch during capture. If `None` (default), auto-inferred as `encoder_cudagraph_max_vision_items_per_batch * max_frames_per_video`, where `max_frames_per_video` is a model-specific value obtained from the model's `processing_info`. An explicit non-negative value overrides auto-inference. If the per-prompt video limit is `0`, this value is forced to `0` regardless of the setting (i.e., the encoder falls back to image-only mode).

 ## Usage guide


--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -535,13 +535,15 @@ class CompilationConfig:
    model's budget range. User-provided positive value overrides
    auto-inference."""

-    encoder_cudagraph_max_frames_per_batch: int = 0
+    encoder_cudagraph_max_frames_per_batch: int | None = None
    """Maximum total video frames per batch for encoder CUDA graph capture.
    Controls the cu_seqlens buffer size (one entry per attention sequence,
-    i.e. one per video frame). If 0 (default), auto-inferred per budget
-    level as token_budget (tight bound: packing guarantees
-    sum(T_i) <= token_budget). Positive value overrides auto-inference
-    and applies to all budget levels."""
+    i.e. one per video frame).
+    If None (default), auto-inferred as encoder_cudagraph_max_vision_items_per_batch
+    * max_frames_per_video (model-specific value according to processing_info).
+    An explicit non-negative value overrides auto-inference and applies to all
+    budget levels. If the per-prompt video limit is `0`, this is forced to `0`
+    (i.e., fall back to image-only mode)."""

    # Inductor capture
    compile_sizes: list[int | str] | None = None
@@ -993,11 +995,12 @@ class CompilationConfig:
            )
        if (
            self.cudagraph_mm_encoder
+            and self.encoder_cudagraph_max_frames_per_batch is not None
            and self.encoder_cudagraph_max_frames_per_batch < 0
        ):
            raise ValueError(
                "encoder_cudagraph_max_frames_per_batch must be "
-                "non-negative (0 = auto-infer)"
+                "non-negative (None = auto-infer)"
            )

        if self.backend == "":

--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1531,6 +1531,12 @@ class SupportsEncoderCudaGraph(Protocol):
        """Return the modality of the inputs."""
        ...

+    def get_max_frames_per_video(
+        self,
+    ) -> int:
+        """Return model-specific max frames per video."""
+        ...
+
    def get_encoder_cudagraph_budget_range(
        self,
        vllm_config: "VllmConfig",

--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -1640,6 +1640,7 @@ class Qwen3VLForConditionalGeneration(
        multimodal_config = vllm_config.model_config.multimodal_config

        self.config = config
+        self.model_config = vllm_config.model_config
        self._tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
        self.multimodal_config = multimodal_config
        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
@@ -1783,6 +1784,15 @@ class Qwen3VLForConditionalGeneration(
            return "image"
        return "video"

+    def get_max_frames_per_video(self) -> int:
+        mm_registry = MULTIMODAL_REGISTRY
+        info = mm_registry.get_processing_info(self.model_config)
+        max_frames_per_video = info.get_num_frames_with_most_features(
+            seq_len=self.model_config.max_model_len,
+            mm_counts={"video": self.multimodal_config.get_limit_per_prompt("video")},
+        )
+        return max_frames_per_video
+
    def get_encoder_cudagraph_budget_range(
        self,
        vllm_config,

--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -193,6 +193,9 @@ class MultiModalRegistry:
        ctx = self._create_processing_ctx(model_config, tokenizer)
        return factories.info(ctx)

+    def get_processing_info(self, model_config: "ModelConfig") -> BaseProcessingInfo:
+        return self._create_processing_info(model_config, tokenizer=None)
+
    def create_processor(
        self,
        model_config: "ModelConfig",

--- a/vllm/v1/worker/encoder_cudagraph.py
+++ b/vllm/v1/worker/encoder_cudagraph.py
@@ -67,13 +67,15 @@ class EncoderCudaGraphManager:

        comp_config = vllm_config.compilation_config
        user_budgets = comp_config.encoder_cudagraph_token_budgets
-        user_max_mm_items = comp_config.encoder_cudagraph_max_vision_items_per_batch
+        user_max_vision_items = comp_config.encoder_cudagraph_max_vision_items_per_batch
        user_max_frames = comp_config.encoder_cudagraph_max_frames_per_batch

-        if user_budgets and user_max_mm_items > 0:
+        multimodal_config = vllm_config.model_config.multimodal_config
+
+        if user_budgets and user_max_vision_items > 0:
            # Fully user-specified
            self.token_budgets = sorted(user_budgets)
-            self.max_batch_size = user_max_mm_items
+            self.max_batch_size = user_max_vision_items
        else:
            # Auto-infer missing values from model
            min_budget, max_budget = model.get_encoder_cudagraph_budget_range(
@@ -85,14 +87,20 @@ class EncoderCudaGraphManager:
                else self._generate_budgets(min_budget, max_budget)
            )
            self.max_batch_size = (
-                user_max_mm_items if user_max_mm_items > 0 else max_budget // min_budget
+                user_max_vision_items
+                if user_max_vision_items > 0
+                else max_budget // min_budget
            )

-        if user_max_frames > 0:
+        assert multimodal_config is not None
+        if multimodal_config.get_limit_per_prompt("video") == 0:
+            self.max_frames_per_batch = 0
+        elif user_max_frames is not None:
            self.max_frames_per_batch = user_max_frames
        else:
-            # TODO(shen-shanshan): optimize this auto-infer for max_frames_per_batch.
-            self.max_frames_per_batch = self.max_batch_size * 2
+            # Auto-infer as max_batch_size * the model-specific max frames per video.
+            max_frames_per_video = self.model.get_max_frames_per_video()
+            self.max_frames_per_batch = self.max_batch_size * max_frames_per_video

        mm_config = vllm_config.model_config.multimodal_config
        self.use_dp = (
@@ -111,7 +119,7 @@ class EncoderCudaGraphManager:
            "budgets=%s, max_batch_size=%d, max_frames_per_batch=%s, use_dp=%s",
            self.token_budgets,
            self.max_batch_size,
-            self.max_frames_per_batch if self.max_frames_per_batch > 0 else "auto",
+            self.max_frames_per_batch,
            self.use_dp,
        )