[MM][CG] Support `--enable-vit-cuda-graph` option for VLM examples (#40580)

Signed-off-by: shen-shanshan <467638484@qq.com>

[MM][CG] Support `--enable-vit-cuda-graph` option for VLM examples (#40580)
Signed-off-by: shen-shanshan <467638484@qq.com>
fe57be78 · Shanshan Shen · GitHub · 8317cedc · fe57be78 · fe57be78
Unverified Commit fe57be78 authored Apr 23, 2026 by Shanshan Shen Committed by GitHub Apr 22, 2026
Showing with 40 additions and 6 deletions

examples/offline_inference/vision_language.py examples/offline_inference/vision_language.py +35 -5

vllm/model_executor/models/qwen3_vl.py vllm/model_executor/models/qwen3_vl.py +5 -1

No files found.
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -2463,6 +2463,12 @@ MODELS_NEED_VIDEO_METADATA = [
 ]
+# Values of `args.model_type` for which `--enable-vit-cuda-graph` takes effect;
+# for any other model type the flag leaves the engine args unchanged.
+MODELS_SUPPORT_VIT_CUDA_GRAPH = [
+    "qwen3_vl",
+    "qwen3_vl_moe",
+]
 def get_multi_modal_input(args):
    """
    return {
@@ -2575,6 +2581,29 @@ def apply_image_repeat(
    return inputs, inputs_with_empty_media
+def maybe_add_vit_cuda_graph_compilation_config(args, engine_args):
+    """Optionally set ViT CUDA graph options on ``engine_args``.
+
+    When ``args.enable_vit_cuda_graph`` is truthy and ``args.model_type`` is
+    listed in ``MODELS_SUPPORT_VIT_CUDA_GRAPH``, the ``compilation_config``
+    attribute of ``engine_args`` is set to a dict that turns on
+    ``cudagraph_mm_encoder`` and sets
+    ``encoder_cudagraph_max_vision_items_per_batch``. In every other case
+    ``engine_args`` is left untouched.
+
+    Args:
+        args: Parsed command-line namespace; ``model_type``, ``modality`` and
+            ``enable_vit_cuda_graph`` are read from it.
+        engine_args: Engine arguments object; modified in place.
+
+    Returns:
+        The same ``engine_args`` object that was passed in.
+
+    Raises:
+        ValueError: If the option applies to the model but ``modality`` is
+            not ``"image"``, ``"video"`` or ``"image+video"``.
+    """
+    model = args.model_type
+    modality = args.modality
+    enable_vit_cuda_graph = args.enable_vit_cuda_graph
+    if enable_vit_cuda_graph and model in MODELS_SUPPORT_VIT_CUDA_GRAPH:
+        # A single modality uses 1 vision item per batch; "image+video" uses 2
+        # (presumably one image plus one video per prompt -- TODO confirm
+        # against how the example builds its inputs).
+        if modality == "image" or modality == "video":
+            vision_items_per_batch = 1
+        elif modality == "image+video":
+            vision_items_per_batch = 2
+        else:
+            raise ValueError(
+                f"modality={modality} is not supported for vit cuda graph."
+            )
+        # NOTE: this assignment replaces whatever `compilation_config` was
+        # already set on `engine_args`; nothing is merged.
+        engine_args.compilation_config = {
+            "cudagraph_mm_encoder": True,
+            "encoder_cudagraph_max_vision_items_per_batch": vision_items_per_batch,
+        }
+    return engine_args
 @contextmanager
 def time_counter(enable: bool):
    if enable:
@@ -2625,33 +2654,28 @@ def parse_args():
        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    parser.add_argument(
        "--image-repeat-prob",
        type=float,
        default=None,
        help="Simulates the hit-ratio for multi-modal preprocessor cache (if enabled)",
    )
    parser.add_argument(
        "--disable-mm-processor-cache",
        action="store_true",
        help="If True, disables caching of multi-modal processor.",
    )
    parser.add_argument(
        "--time-generate",
        action="store_true",
        help="If True, then print the total generate() call time",
    )
    parser.add_argument(
        "--use-different-prompt-per-request",
        action="store_true",
        help="If True, then use different prompt (with the same multi-modal "
        "data) for each request.",
    )
    parser.add_argument(
        "--verify-mm-cache-hit-with-uuids",
        action="store_true",
@@ -2665,6 +2689,11 @@ def parse_args():
        default=None,
        help="Tensor parallel size to override the model's default setting. ",
    )
+    parser.add_argument(
+        "--enable-vit-cuda-graph",
+        action="store_true",
+        help="If True, will enable vit cuda graph capture and replay for the model.",
+    )
    return parser.parse_args()
@@ -2698,6 +2727,7 @@ def main(args):
    engine_args.mm_processor_cache_gb = mm_processor_cache_gb
    if args.tensor_parallel_size is not None:
        engine_args.tensor_parallel_size = args.tensor_parallel_size
+    engine_args = maybe_add_vit_cuda_graph_compilation_config(args, engine_args)
    llm = LLM.from_engine_args(engine_args)
    # Don't want to check the flag multiple times, so just hijack `prompts`.

--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -1802,7 +1802,11 @@ class Qwen3VLForConditionalGeneration(
        #                 spatial_merge_size=2 → 8x8 = 64 tokens
        min_budget = 64
        # Max: capped by max_num_batched_tokens
-        max_budget = vllm_config.scheduler_config.max_num_batched_tokens
+        # TODO(shen-shanshan): the max_budget auto-infer needs to be optimized later.
+        max_budget = min(
+            vllm_config.scheduler_config.max_num_batched_tokens,
+            self.model_config.max_model_len,
+        )
        return (min_budget, max_budget)
    def _get_pixel_values_by_modality(