Unverified commit 934b49ef, authored by Yi Yao, committed by GitHub
Browse files

chore(multimodal): Add XPU aggregated video vLLM launch example (#7855)


Signed-off-by: Yi Yao <yi.a.yao@intel.com>
parent 59df10d1
...@@ -76,12 +76,13 @@ GPU_MEM_ARGS=$(build_vllm_gpu_mem_args) ...@@ -76,12 +76,13 @@ GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
# Start vLLM worker with vision model # Start vLLM worker with vision model
# --enforce-eager: Quick deployment (remove for production) # --enforce-eager: Quick deployment (remove for production)
# Extra args from command line come last to allow overrides # Extra args from command line come last to allow overrides
ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME \ python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME \
--max-model-len "$MAX_MODEL_LEN" \ --max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \ --max-num-seqs "$MAX_CONCURRENT_SEQS" \
--block-size "${BLOCK_SIZE:-64}" \ --block-size "${BLOCK_SIZE:-64}" \
$GPU_MEM_ARGS $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}" $GPU_MEM_ARGS $MODEL_EXTRA_ARGS "${EXTRA_ARGS[@]}" &
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest # Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit wait_any_exit
...@@ -7,6 +7,7 @@ import logging ...@@ -7,6 +7,7 @@ import logging
import os import os
import random import random
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional from typing import Optional
import pytest import pytest
...@@ -44,6 +45,10 @@ class VLLMConfig(EngineConfig): ...@@ -44,6 +45,10 @@ class VLLMConfig(EngineConfig):
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join( vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
WORKSPACE_DIR, "examples/backends/vllm" WORKSPACE_DIR, "examples/backends/vllm"
) )
LOCAL_VIDEO_TEST_PATH = Path(
WORKSPACE_DIR, "lib/llm/tests/data/media/240p_10.mp4"
).resolve()
LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri()
# vLLM test configurations # vLLM test configurations
...@@ -54,8 +59,14 @@ vllm_configs = { ...@@ -54,8 +59,14 @@ vllm_configs = {
script_name="xpu/agg_xpu.sh", script_name="xpu/agg_xpu.sh",
marks=[ marks=[
pytest.mark.xpu_1, pytest.mark.xpu_1,
pytest.mark.profiled_vram_gib(3.8), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes(
1_119_388_000
), # KV cache cap (2x safety over min=559_693_824)
pytest.mark.timeout(
360
), # ~8.5x observed 42.2s; bumped for GPU-parallel headroom
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.timeout(300), # 3x measured time (43s) + download time (150s)
], ],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
request_payloads=[ request_payloads=[
...@@ -79,7 +90,15 @@ vllm_configs = { ...@@ -79,7 +90,15 @@ vllm_configs = {
name="aggregated_logprobs_xpu", name="aggregated_logprobs_xpu",
directory=vllm_dir, directory=vllm_dir,
script_name="xpu/agg_xpu.sh", script_name="xpu/agg_xpu.sh",
marks=[pytest.mark.xpu_1, pytest.mark.post_merge], marks=[
pytest.mark.xpu_1,
pytest.mark.profiled_vram_gib(3.8), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes(
1_119_388_000
), # KV cache cap (2x safety over min=559_693_824)
pytest.mark.timeout(120), # ~5x observed 24.3s; CI machines are slower
pytest.mark.post_merge,
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
request_payloads=[ request_payloads=[
chat_payload_with_logprobs( chat_payload_with_logprobs(
...@@ -103,9 +122,14 @@ vllm_configs = { ...@@ -103,9 +122,14 @@ vllm_configs = {
directory=vllm_dir, directory=vllm_dir,
script_name="xpu/agg_lmcache_xpu.sh", script_name="xpu/agg_lmcache_xpu.sh",
marks=[ marks=[
pytest.mark.lmcache,
pytest.mark.xpu_1, pytest.mark.xpu_1,
pytest.mark.profiled_vram_gib(3.8), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes(
1_119_388_000
), # KV cache cap (2x safety over min=559_693_824)
pytest.mark.timeout(360), # ~7x observed 49.0s; old value before profiling
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.timeout(360), # 3x estimated time (70s) + download time (150s)
], ],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
request_payloads=[ request_payloads=[
...@@ -120,9 +144,14 @@ vllm_configs = { ...@@ -120,9 +144,14 @@ vllm_configs = {
directory=vllm_dir, directory=vllm_dir,
script_name="xpu/agg_lmcache_multiproc_xpu.sh", script_name="xpu/agg_lmcache_multiproc_xpu.sh",
marks=[ marks=[
pytest.mark.lmcache,
pytest.mark.xpu_1, pytest.mark.xpu_1,
pytest.mark.profiled_vram_gib(3.8), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes(
1_119_388_000
), # KV cache cap (2x safety over min=559_693_824)
pytest.mark.timeout(360), # ~7x observed 49.3s; old value before profiling
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.timeout(360), # 3x estimated time (70s) + download time (150s)
], ],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={ env={
...@@ -141,8 +170,14 @@ vllm_configs = { ...@@ -141,8 +170,14 @@ vllm_configs = {
script_name="xpu/agg_request_planes_xpu.sh", script_name="xpu/agg_request_planes_xpu.sh",
marks=[ marks=[
pytest.mark.xpu_1, pytest.mark.xpu_1,
pytest.mark.profiled_vram_gib(3.8), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes(
1_119_388_000
), # KV cache cap (2x safety over min=559_693_824)
pytest.mark.timeout(
360
), # ~8x observed 43.0s; bumped for GPU-parallel headroom
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.timeout(300), # 3x measured time (43s) + download time (150s)
], ],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
script_args=["--tcp"], script_args=["--tcp"],
...@@ -157,8 +192,14 @@ vllm_configs = { ...@@ -157,8 +192,14 @@ vllm_configs = {
script_name="xpu/agg_request_planes_xpu.sh", script_name="xpu/agg_request_planes_xpu.sh",
marks=[ marks=[
pytest.mark.xpu_1, pytest.mark.xpu_1,
pytest.mark.profiled_vram_gib(3.8), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes(
1_119_388_000
), # KV cache cap (2x safety over min=559_693_824)
pytest.mark.timeout(
360
), # ~8.5x observed 42.3s; bumped for GPU-parallel headroom
pytest.mark.pre_merge, pytest.mark.pre_merge,
pytest.mark.timeout(300), # 3x measured time (43s) + download time (150s)
], ],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
script_args=["--http"], script_args=["--http"],
...@@ -173,7 +214,8 @@ vllm_configs = { ...@@ -173,7 +214,8 @@ vllm_configs = {
script_name="xpu/agg_router_xpu.sh", script_name="xpu/agg_router_xpu.sh",
marks=[ marks=[
pytest.mark.xpu_2, pytest.mark.xpu_2,
pytest.mark.post_merge, pytest.mark.pre_merge,
pytest.mark.skip(reason="DYN-2263"),
], ],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
request_payloads=[ request_payloads=[
...@@ -230,8 +272,12 @@ vllm_configs = { ...@@ -230,8 +272,12 @@ vllm_configs = {
script_name="xpu/agg_multimodal_xpu.sh", script_name="xpu/agg_multimodal_xpu.sh",
marks=[ marks=[
pytest.mark.xpu_1, pytest.mark.xpu_1,
pytest.mark.pre_merge, pytest.mark.profiled_vram_gib(9.6), # actual profiled peak with kv-bytes
pytest.mark.skip("skip for XPU"), pytest.mark.requested_vllm_kv_cache_bytes(
1_710_490_000
), # KV cache cap (2x safety over min=855_244_800)
pytest.mark.timeout(220), # ~5x observed 43.7s; 2B model loads slower on CI
pytest.mark.post_merge,
], ],
model="Qwen/Qwen2-VL-2B-Instruct", model="Qwen/Qwen2-VL-2B-Instruct",
# Pass --frontend-decoding to enable Rust frontend image decoding + NIXL RDMA transfer # Pass --frontend-decoding to enable Rust frontend image decoding + NIXL RDMA transfer
...@@ -265,8 +311,14 @@ vllm_configs = { ...@@ -265,8 +311,14 @@ vllm_configs = {
script_name="xpu/agg_multimodal_xpu.sh", script_name="xpu/agg_multimodal_xpu.sh",
marks=[ marks=[
pytest.mark.xpu_1, pytest.mark.xpu_1,
pytest.mark.pre_merge, pytest.mark.profiled_vram_gib(19.9), # actual profiled peak with kv-bytes
pytest.mark.skip(reason="skip for XPU"), pytest.mark.requested_vllm_kv_cache_bytes(
922_354_000
), # KV cache cap (2x safety over min=461_176_832)
pytest.mark.timeout(
360
), # ~7x observed 50.0s; 7B model loads ~48s on CI (A10G/L4)
pytest.mark.post_merge,
], ],
model="Qwen/Qwen2.5-VL-7B-Instruct", model="Qwen/Qwen2.5-VL-7B-Instruct",
script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"], script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
...@@ -285,7 +337,7 @@ vllm_configs = { ...@@ -285,7 +337,7 @@ vllm_configs = {
}, },
], ],
repeat_count=1, repeat_count=1,
expected_response=["Green, White"], expected_response=["purple"],
max_tokens=100, max_tokens=100,
), ),
], ],
...@@ -296,6 +348,13 @@ vllm_configs = { ...@@ -296,6 +348,13 @@ vllm_configs = {
script_name="xpu/agg_multimodal_xpu.sh", script_name="xpu/agg_multimodal_xpu.sh",
marks=[ marks=[
pytest.mark.xpu_1, pytest.mark.xpu_1,
pytest.mark.profiled_vram_gib(14.9), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes(
922_354_000
), # KV cache cap (2x safety over min=461_176_832)
pytest.mark.timeout(
300
), # ~7x observed 42.7s; 7B model loads ~48s on CI (A10G/L4)
pytest.mark.nightly, pytest.mark.nightly,
# https://github.com/ai-dynamo/dynamo/issues/4501 # https://github.com/ai-dynamo/dynamo/issues/4501
pytest.mark.xfail(strict=False), pytest.mark.xfail(strict=False),
...@@ -335,7 +394,6 @@ vllm_configs = { ...@@ -335,7 +394,6 @@ vllm_configs = {
pytest.mark.xpu_2, pytest.mark.xpu_2,
pytest.mark.multimodal, pytest.mark.multimodal,
pytest.mark.nightly, pytest.mark.nightly,
pytest.mark.skip(reason="skip for XPU"),
], ],
model="Qwen/Qwen3-VL-8B-Instruct", model="Qwen/Qwen3-VL-8B-Instruct",
script_args=[ script_args=[
...@@ -406,17 +464,50 @@ vllm_configs = { ...@@ -406,17 +464,50 @@ vllm_configs = {
) )
], ],
), ),
# Video multimodal tests for CI using the vLLM video launch scripts.
"multimodal_video_agg": VLLMConfig(
name="multimodal_video_agg_xpu",
directory=vllm_dir,
script_name="xpu/agg_multimodal_xpu.sh",
marks=[
pytest.mark.xpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(600), # TODO: profile to get tighter timeout
], # TODO: profile to get max_vram
model="Qwen/Qwen3-VL-2B-Instruct",
delayed_start=60, # Video models require longer loading time
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct"],
timeout=600, # 10 minutes for video processing overhead
request_payloads=[
chat_payload(
[
{"type": "text", "text": "Describe the video in detail"},
{
"type": "video_url",
"video_url": {"url": LOCAL_VIDEO_TEST_URI},
},
],
repeat_count=1,
expected_response=["red", "static", "still"],
temperature=0.0,
max_tokens=100,
)
],
),
"completions_only": VLLMConfig( "completions_only": VLLMConfig(
name="completions_only_xpu", name="completions_only_xpu",
directory=vllm_dir, directory=vllm_dir,
script_name="xpu/agg_xpu.sh", script_name="xpu/agg_xpu.sh",
marks=[ marks=[
pytest.mark.xpu_1, pytest.mark.xpu_1,
pytest.mark.post_merge, pytest.mark.profiled_vram_gib(18.3), # actual profiled peak with kv-bytes
pytest.mark.skip(reason="skip for XPU"), pytest.mark.requested_vllm_kv_cache_bytes(
4_074_898_000
), # KV cache cap (2x safety over min=2_037_448_704)
pytest.mark.timeout( pytest.mark.timeout(
420 420
), # 3x estimated time (60s) + download time (240s) for 7B model ), # 7B model loads ~48s on CI (A10G/L4) vs ~15s locally
pytest.mark.post_merge,
], ],
model="deepseek-ai/deepseek-llm-7b-base", model="deepseek-ai/deepseek-llm-7b-base",
script_args=[ script_args=[
...@@ -433,7 +524,15 @@ vllm_configs = { ...@@ -433,7 +524,15 @@ vllm_configs = {
name="guided_decoding_xpu", name="guided_decoding_xpu",
directory=vllm_dir, directory=vllm_dir,
script_name="xpu/agg_xpu.sh", script_name="xpu/agg_xpu.sh",
marks=[pytest.mark.xpu_1, pytest.mark.pre_merge], marks=[
pytest.mark.xpu_1,
pytest.mark.profiled_vram_gib(3.8), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes(
1_119_388_000
), # KV cache cap (2x safety over min=559_693_824)
pytest.mark.timeout(110), # ~5x observed 22.3s; CI machines are slower
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
request_payloads=[ request_payloads=[
chat_payload( chat_payload(
...@@ -501,9 +600,8 @@ def test_serve_deployment( ...@@ -501,9 +600,8 @@ def test_serve_deployment(
@pytest.mark.vllm @pytest.mark.vllm
@pytest.mark.e2e @pytest.mark.e2e
@pytest.mark.xpu_1 @pytest.mark.xpu_2
@pytest.mark.nightly @pytest.mark.nightly
@pytest.mark.skip(reason="skip for XPU")
@pytest.mark.timeout(360) # Match VLLMConfig.timeout for this multimodal deployment @pytest.mark.timeout(360) # Match VLLMConfig.timeout for this multimodal deployment
def test_multimodal_b64( def test_multimodal_b64(
request, request,
...@@ -533,7 +631,7 @@ def test_multimodal_b64( ...@@ -533,7 +631,7 @@ def test_multimodal_b64(
}, },
], ],
repeat_count=1, repeat_count=1,
expected_response=["Green, White"], expected_response=["purple"],
max_tokens=100, max_tokens=100,
) )
...@@ -556,6 +654,65 @@ def test_multimodal_b64( ...@@ -556,6 +654,65 @@ def test_multimodal_b64(
run_serve_deployment(config, request, ports=dynamo_dynamic_ports) run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.xpu_1
@pytest.mark.pre_merge
@pytest.mark.timeout(220)
def test_multimodal_b64_frontend_decoding(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
):
    """Serve a vision model with ``--frontend-decoding`` and query it with a base64 image.

    The image is embedded in the request as an inline ``data:`` URI instead of
    an HTTP URL, so the request goes through the Rust frontend image decode +
    NIXL RDMA transfer path. The test passes when the response mentions
    "green", which guards against the strip_inline_data_urls optimization
    breaking correctness.

    ``runtime_services_dynamic_ports`` and ``predownload_models`` are not
    referenced in the body; they are requested as fixtures only.
    """
    model_name = "Qwen/Qwen3-VL-2B-Instruct"

    # Encode the shared test image and wrap it as an inline data URI.
    encoded_image = base64.b64encode(get_multimodal_test_image_bytes()).decode()
    inline_image_url = f"data:image/png;base64,{encoded_image}"

    message_parts = [
        {
            "type": "text",
            "text": "What colors are in the following image? Respond only with the colors.",
        },
        {
            "type": "image_url",
            "image_url": {"url": inline_image_url},
        },
    ]
    payload = chat_payload(
        message_parts,
        repeat_count=1,
        expected_response=["green"],
        temperature=0.0,
        max_tokens=100,
    )

    base_config = VLLMConfig(
        name="test_multimodal_b64_frontend_decoding",
        directory=vllm_dir,
        script_name="xpu/agg_multimodal_xpu.sh",
        marks=[],
        model=model_name,
        script_args=["--model", model_name, "--frontend-decoding"],
        delayed_start=0,
        timeout=220,
        request_payloads=[payload],
    )

    # Point the deployment at the dynamically allocated frontend port.
    deployment_config = dataclasses.replace(
        base_config, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    run_serve_deployment(deployment_config, request, ports=dynamo_dynamic_ports)
# LoRA Test Directory # LoRA Test Directory
lora_dir = os.path.join(vllm_dir, "launch/lora") lora_dir = os.path.join(vllm_dir, "launch/lora")
...@@ -599,7 +756,6 @@ def lora_chat_payload( ...@@ -599,7 +756,6 @@ def lora_chat_payload(
@pytest.mark.xpu_1 @pytest.mark.xpu_1
@pytest.mark.model("Qwen/Qwen3-0.6B") @pytest.mark.model("Qwen/Qwen3-0.6B")
@pytest.mark.timeout(600) @pytest.mark.timeout(600)
@pytest.mark.skip(reason="skip for XPU")
@pytest.mark.post_merge @pytest.mark.post_merge
def test_lora_aggregated( def test_lora_aggregated(
request, request,
...@@ -656,7 +812,6 @@ def test_lora_aggregated( ...@@ -656,7 +812,6 @@ def test_lora_aggregated(
@pytest.mark.xpu_2 @pytest.mark.xpu_2
@pytest.mark.model("Qwen/Qwen3-0.6B") @pytest.mark.model("Qwen/Qwen3-0.6B")
@pytest.mark.timeout(600) @pytest.mark.timeout(600)
@pytest.mark.skip(reason="skip for XPU")
@pytest.mark.post_merge @pytest.mark.post_merge
@pytest.mark.parametrize("num_system_ports", [2], indirect=True) @pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_lora_aggregated_router( def test_lora_aggregated_router(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment