chore(multimodal): Add XPU aggregated video vLLM launch example (#7855)

Signed-off-by: Yi Yao <yi.a.yao@intel.com>

chore(multimodal): Add XPU aggregated video vLLM launch example (#7855)
Signed-off-by: Yi Yao <yi.a.yao@intel.com>
934b49ef · Yi Yao · GitHub · 59df10d1 · 934b49ef · 934b49ef
Unverified commit 934b49ef, authored Apr 13, 2026 by Yi Yao; committed by GitHub Apr 12, 2026
Showing with 180 additions and 24 deletions

examples/backends/vllm/launch/xpu/agg_multimodal_xpu.sh (+3 -2)

tests/serve/test_vllm_xpu.py (+177 -22)

No files found.
--- a/examples/backends/vllm/launch/xpu/agg_multimodal_xpu.sh
+++ b/examples/backends/vllm/launch/xpu/agg_multimodal_xpu.sh
@@ -76,12 +76,13 @@ GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 # Start vLLM worker with vision model
 # --enforce-eager: Quick deployment (remove for production)
 # Extra args from command line come last to allow overrides
+ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} \
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
-ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME \
+    python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME \
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
    --block-size "${BLOCK_SIZE:-64}" \
-    $GPU_MEM_ARGS $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}"
+    $GPU_MEM_ARGS $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}" &
 # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
 wait_any_exit
--- a/tests/serve/test_vllm_xpu.py
+++ b/tests/serve/test_vllm_xpu.py
@@ -7,6 +7,7 @@ import logging
 import os
 import random
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import Optional
 import pytest
@@ -44,6 +45,10 @@ class VLLMConfig(EngineConfig):
 vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/vllm"
 )
+LOCAL_VIDEO_TEST_PATH = Path(
+    WORKSPACE_DIR, "lib/llm/tests/data/media/240p_10.mp4"
+).resolve()
+LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri()
 # vLLM test configurations
@@ -54,8 +59,14 @@ vllm_configs = {
        script_name="xpu/agg_xpu.sh",
        marks=[
            pytest.mark.xpu_1,
+            pytest.mark.profiled_vram_gib(3.8),  # actual profiled peak with kv-bytes
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                1_119_388_000
+            ),  # KV cache cap (2x safety over min=559_693_824)
+            pytest.mark.timeout(
+                360
+            ),  # ~8.5x observed 42.2s; bumped for GPU-parallel headroom
            pytest.mark.pre_merge,
-            pytest.mark.timeout(300),  # 3x measured time (43s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
@@ -79,7 +90,15 @@ vllm_configs = {
        name="aggregated_logprobs_xpu",
        directory=vllm_dir,
        script_name="xpu/agg_xpu.sh",
-        marks=[pytest.mark.xpu_1, pytest.mark.post_merge],
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.profiled_vram_gib(3.8),  # actual profiled peak with kv-bytes
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                1_119_388_000
+            ),  # KV cache cap (2x safety over min=559_693_824)
+            pytest.mark.timeout(120),  # ~5x observed 24.3s; CI machines are slower
+            pytest.mark.post_merge,
+        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_with_logprobs(
@@ -103,9 +122,14 @@ vllm_configs = {
        directory=vllm_dir,
        script_name="xpu/agg_lmcache_xpu.sh",
        marks=[
+            pytest.mark.lmcache,
            pytest.mark.xpu_1,
+            pytest.mark.profiled_vram_gib(3.8),  # actual profiled peak with kv-bytes
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                1_119_388_000
+            ),  # KV cache cap (2x safety over min=559_693_824)
+            pytest.mark.timeout(360),  # ~7x observed 49.0s; old value before profiling
            pytest.mark.pre_merge,
-            pytest.mark.timeout(360),  # 3x estimated time (70s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
@@ -120,9 +144,14 @@ vllm_configs = {
        directory=vllm_dir,
        script_name="xpu/agg_lmcache_multiproc_xpu.sh",
        marks=[
+            pytest.mark.lmcache,
            pytest.mark.xpu_1,
+            pytest.mark.profiled_vram_gib(3.8),  # actual profiled peak with kv-bytes
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                1_119_388_000
+            ),  # KV cache cap (2x safety over min=559_693_824)
+            pytest.mark.timeout(360),  # ~7x observed 49.3s; old value before profiling
            pytest.mark.pre_merge,
-            pytest.mark.timeout(360),  # 3x estimated time (70s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        env={
@@ -141,8 +170,14 @@ vllm_configs = {
        script_name="xpu/agg_request_planes_xpu.sh",
        marks=[
            pytest.mark.xpu_1,
+            pytest.mark.profiled_vram_gib(3.8),  # actual profiled peak with kv-bytes
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                1_119_388_000
+            ),  # KV cache cap (2x safety over min=559_693_824)
+            pytest.mark.timeout(
+                360
+            ),  # ~8x observed 43.0s; bumped for GPU-parallel headroom
            pytest.mark.pre_merge,
-            pytest.mark.timeout(300),  # 3x measured time (43s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        script_args=["--tcp"],
@@ -157,8 +192,14 @@ vllm_configs = {
        script_name="xpu/agg_request_planes_xpu.sh",
        marks=[
            pytest.mark.xpu_1,
+            pytest.mark.profiled_vram_gib(3.8),  # actual profiled peak with kv-bytes
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                1_119_388_000
+            ),  # KV cache cap (2x safety over min=559_693_824)
+            pytest.mark.timeout(
+                360
+            ),  # ~8.5x observed 42.3s; bumped for GPU-parallel headroom
            pytest.mark.pre_merge,
-            pytest.mark.timeout(300),  # 3x measured time (43s) + download time (150s)
        ],
        model="Qwen/Qwen3-0.6B",
        script_args=["--http"],
@@ -173,7 +214,8 @@ vllm_configs = {
        script_name="xpu/agg_router_xpu.sh",
        marks=[
            pytest.mark.xpu_2,
-            pytest.mark.post_merge,
+            pytest.mark.pre_merge,
+            pytest.mark.skip(reason="DYN-2263"),
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
@@ -230,8 +272,12 @@ vllm_configs = {
        script_name="xpu/agg_multimodal_xpu.sh",
        marks=[
            pytest.mark.xpu_1,
-            pytest.mark.pre_merge,
+            pytest.mark.profiled_vram_gib(9.6),  # actual profiled peak with kv-bytes
-            pytest.mark.skip("skip for XPU"),
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                1_710_490_000
+            ),  # KV cache cap (2x safety over min=855_244_800)
+            pytest.mark.timeout(220),  # ~5x observed 43.7s; 2B model loads slower on CI
+            pytest.mark.post_merge,
        ],
        model="Qwen/Qwen2-VL-2B-Instruct",
        # Pass --frontend-decoding to enable Rust frontend image decoding + NIXL RDMA transfer
@@ -265,8 +311,14 @@ vllm_configs = {
        script_name="xpu/agg_multimodal_xpu.sh",
        marks=[
            pytest.mark.xpu_1,
-            pytest.mark.pre_merge,
+            pytest.mark.profiled_vram_gib(19.9),  # actual profiled peak with kv-bytes
-            pytest.mark.skip(reason="skip for XPU"),
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                922_354_000
+            ),  # KV cache cap (2x safety over min=461_176_832)
+            pytest.mark.timeout(
+                360
+            ),  # ~7x observed 50.0s; 7B model loads ~48s on CI (A10G/L4)
+            pytest.mark.post_merge,
        ],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
@@ -285,7 +337,7 @@ vllm_configs = {
                    },
                ],
                repeat_count=1,
-                expected_response=["Green, White"],
+                expected_response=["purple"],
                max_tokens=100,
            ),
        ],
@@ -296,6 +348,13 @@ vllm_configs = {
        script_name="xpu/agg_multimodal_xpu.sh",
        marks=[
            pytest.mark.xpu_1,
+            pytest.mark.profiled_vram_gib(14.9),  # actual profiled peak with kv-bytes
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                922_354_000
+            ),  # KV cache cap (2x safety over min=461_176_832)
+            pytest.mark.timeout(
+                300
+            ),  # ~7x observed 42.7s; 7B model loads ~48s on CI (A10G/L4)
            pytest.mark.nightly,
            # https://github.com/ai-dynamo/dynamo/issues/4501
            pytest.mark.xfail(strict=False),
@@ -335,7 +394,6 @@ vllm_configs = {
            pytest.mark.xpu_2,
            pytest.mark.multimodal,
            pytest.mark.nightly,
-            pytest.mark.skip(reason="skip for XPU"),
        ],
        model="Qwen/Qwen3-VL-8B-Instruct",
        script_args=[
@@ -406,17 +464,50 @@ vllm_configs = {
            )
        ],
    ),
+    # Video multimodal test for CI, launched with the XPU aggregated multimodal script (xpu/agg_multimodal_xpu.sh).
+    "multimodal_video_agg": VLLMConfig(
+        name="multimodal_video_agg_xpu",
+        directory=vllm_dir,
+        script_name="xpu/agg_multimodal_xpu.sh",
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(600),  # TODO: profile to get tighter timeout
+        ],  # TODO: profile to get max_vram
+        model="Qwen/Qwen3-VL-2B-Instruct",
+        delayed_start=60,  # Video models require longer loading time
+        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct"],
+        timeout=600,  # 10 minutes for video processing overhead
+        request_payloads=[
+            chat_payload(
+                [
+                    {"type": "text", "text": "Describe the video in detail"},
+                    {
+                        "type": "video_url",
+                        "video_url": {"url": LOCAL_VIDEO_TEST_URI},
+                    },
+                ],
+                repeat_count=1,
+                expected_response=["red", "static", "still"],
+                temperature=0.0,
+                max_tokens=100,
+            )
+        ],
+    ),
    "completions_only": VLLMConfig(
        name="completions_only_xpu",
        directory=vllm_dir,
        script_name="xpu/agg_xpu.sh",
        marks=[
            pytest.mark.xpu_1,
-            pytest.mark.post_merge,
+            pytest.mark.profiled_vram_gib(18.3),  # actual profiled peak with kv-bytes
-            pytest.mark.skip(reason="skip for XPU"),
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                4_074_898_000
+            ),  # KV cache cap (2x safety over min=2_037_448_704)
            pytest.mark.timeout(
                420
-            ),  # 3x estimated time (60s) + download time (240s) for 7B model
+            ),  # 7B model loads ~48s on CI (A10G/L4) vs ~15s locally
+            pytest.mark.post_merge,
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=[
@@ -433,7 +524,15 @@ vllm_configs = {
        name="guided_decoding_xpu",
        directory=vllm_dir,
        script_name="xpu/agg_xpu.sh",
-        marks=[pytest.mark.xpu_1, pytest.mark.pre_merge],
+        marks=[
+            pytest.mark.xpu_1,
+            pytest.mark.profiled_vram_gib(3.8),  # actual profiled peak with kv-bytes
+            pytest.mark.requested_vllm_kv_cache_bytes(
+                1_119_388_000
+            ),  # KV cache cap (2x safety over min=559_693_824)
+            pytest.mark.timeout(110),  # ~5x observed 22.3s; CI machines are slower
+            pytest.mark.pre_merge,
+        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload(
@@ -501,9 +600,8 @@ def test_serve_deployment(
 @pytest.mark.vllm
 @pytest.mark.e2e
-@pytest.mark.xpu_1
+@pytest.mark.xpu_2
 @pytest.mark.nightly
-@pytest.mark.skip(reason="skip for XPU")
 @pytest.mark.timeout(360)  # Match VLLMConfig.timeout for this multimodal deployment
 def test_multimodal_b64(
    request,
@@ -533,7 +631,7 @@ def test_multimodal_b64(
            },
        ],
        repeat_count=1,
-        expected_response=["Green, White"],
+        expected_response=["purple"],
        max_tokens=100,
    )
@@ -556,6 +654,65 @@ def test_multimodal_b64(
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
+@pytest.mark.vllm
+@pytest.mark.e2e
+@pytest.mark.xpu_1
+@pytest.mark.pre_merge
+@pytest.mark.timeout(220)  # same value as the VLLMConfig timeout=220 set in the body
+def test_multimodal_b64_frontend_decoding(
+    request,
+    runtime_services_dynamic_ports,  # not referenced in the body; requested as a fixture only
+    dynamo_dynamic_ports,  # supplies frontend_port and is forwarded as ports= to run_serve_deployment
+    predownload_models,  # not referenced in the body; presumably pre-fetches the model - TODO confirm
+):
+    """
+    Test multimodal inference with base64 images through frontend decoding path.
+    This exercises the Rust frontend image decode + NIXL RDMA transfer path
+    with inline base64 data: URIs (not HTTP URLs). Verifies that the
+    strip_inline_data_urls optimization does not break correctness.
+    """
+    b64_img = base64.b64encode(get_multimodal_test_image_bytes()).decode()  # test image bytes -> base64 text for the data: URI
+    b64_payload = chat_payload(
+        [
+            {
+                "type": "text",
+                "text": "What colors are in the following image? Respond only with the colors.",
+            },
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/png;base64,{b64_img}"},  # image is sent inline, not as an HTTP URL
+            },
+        ],
+        repeat_count=1,
+        expected_response=["green"],
+        temperature=0.0,  # NOTE(review): presumably for reproducible output - confirm
+        max_tokens=100,
+    )
+    config = VLLMConfig(
+        name="test_multimodal_b64_frontend_decoding",
+        directory=vllm_dir,
+        script_name="xpu/agg_multimodal_xpu.sh",
+        marks=[],  # left empty; the pytest marks for this test are the decorators on the function
+        model="Qwen/Qwen3-VL-2B-Instruct",
+        script_args=[
+            "--model",
+            "Qwen/Qwen3-VL-2B-Instruct",
+            "--frontend-decoding",  # selects the frontend-decoding path this test targets
+        ],
+        delayed_start=0,
+        timeout=220,
+        request_payloads=[b64_payload],
+    )
+    config = dataclasses.replace(  # copy of config with frontend_port taken from the dynamic ports fixture
+        config, frontend_port=dynamo_dynamic_ports.frontend_port
+    )
+    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
 # LoRA Test Directory
 lora_dir = os.path.join(vllm_dir, "launch/lora")
@@ -599,7 +756,6 @@ def lora_chat_payload(
 @pytest.mark.xpu_1
 @pytest.mark.model("Qwen/Qwen3-0.6B")
 @pytest.mark.timeout(600)
-@pytest.mark.skip(reason="skip for XPU")
 @pytest.mark.post_merge
 def test_lora_aggregated(
    request,
@@ -656,7 +812,6 @@ def test_lora_aggregated(
 @pytest.mark.xpu_2
 @pytest.mark.model("Qwen/Qwen3-0.6B")
 @pytest.mark.timeout(600)
-@pytest.mark.skip(reason="skip for XPU")
 @pytest.mark.post_merge
 @pytest.mark.parametrize("num_system_ports", [2], indirect=True)
 def test_lora_aggregated_router(