Unverified commit f45a6985, authored by Qi Wang and committed by GitHub
Browse files

feat(test): add generalized multimodal model coverage framework (#7975)


Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent e45bb0fe
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from dynamo.common.utils.paths import WORKSPACE_DIR
from tests.utils.multimodal import (
MultimodalModelProfile,
TopologyConfig,
make_audio_payload,
make_image_payload,
make_video_payload,
)
# Topology key -> launch script filename. The keys are the ones used in each
# profile's ``topologies`` dict; the filename is resolved against the config's
# working directory.
VLLM_TOPOLOGY_SCRIPTS: dict[str, str] = {
    # Image / video launchers.
    "agg": "agg_multimodal.sh",
    "e_pd": "disagg_multimodal_e_pd.sh",
    "epd": "disagg_multimodal_epd.sh",
    "p_d": "disagg_multimodal_p_d.sh",
    # Audio launchers.
    "audio_agg": "audio_agg.sh",
    "audio_disagg": "audio_disagg.sh",
}
# The audio launch scripts live outside the default vLLM examples directory,
# so the audio topologies override ``directory`` with this path.
_AUDIO_DIR = os.path.join(WORKSPACE_DIR, "examples/multimodal")

# One entry per (model, payload) combination. Each profile expands into one
# test config per key in ``topologies``.
VLLM_MULTIMODAL_PROFILES: list[MultimodalModelProfile] = [
    # --- Qwen3-VL 2B, image input -----------------------------------------
    MultimodalModelProfile(
        name="Qwen/Qwen3-VL-2B-Instruct",
        short_name="qwen3-vl-2b",
        topologies={
            "agg": TopologyConfig(
                marks=[pytest.mark.post_merge],
                timeout_s=220,
                profiled_vram_gib=9.6,
            ),
            # The disaggregated topologies below run with --single-gpu, packing
            # all workers onto one GPU to keep CI resource requirements low.
            # They carry no profiled_vram_gib / requested_vllm_kv_cache_bytes.
            "e_pd": TopologyConfig(
                marks=[pytest.mark.pre_merge],
                timeout_s=340,  # ~5x observed 68.4s; 2B model loads slower on CI
                single_gpu=True,
            ),
            "epd": TopologyConfig(
                marks=[pytest.mark.pre_merge],
                timeout_s=300,
                single_gpu=True,
            ),
            "p_d": TopologyConfig(
                marks=[pytest.mark.pre_merge],
                timeout_s=300,
                single_gpu=True,
            ),
        },
        request_payloads=[make_image_payload(["green"])],
    ),
    # --- Qwen3-VL 2B, video input -----------------------------------------
    MultimodalModelProfile(
        name="Qwen/Qwen3-VL-2B-Instruct",
        short_name="qwen3-vl-2b-video",
        topologies={
            "agg": TopologyConfig(
                marks=[pytest.mark.pre_merge],
                timeout_s=600,  # 10 minutes for video processing overhead
                delayed_start=60,  # video models require longer loading time
            ),
            "epd": TopologyConfig(
                marks=[pytest.mark.pre_merge],
                timeout_s=600,  # 10 minutes for video processing overhead
                delayed_start=60,  # video models require longer loading time
                single_gpu=True,
            ),
        },
        request_payloads=[make_video_payload(["red", "static", "still"])],
    ),
    # --- Qwen2.5-VL 7B, image input ---------------------------------------
    MultimodalModelProfile(
        name="Qwen/Qwen2.5-VL-7B-Instruct",
        short_name="qwen2.5-vl-7b",
        topologies={
            "agg": TopologyConfig(
                marks=[pytest.mark.post_merge],
                timeout_s=360,  # ~7x observed 50.0s; 7B model loads ~48s on CI
                profiled_vram_gib=19.9,  # profiled peak with the KV cap below
                # KV cache cap (2x safety over min=461_176_832).
                requested_vllm_kv_cache_bytes=922_354_000,
            ),
        },
        request_payloads=[make_image_payload(["purple"])],
    ),
    # --- Qwen2-Audio 7B, audio input (nightly) ----------------------------
    MultimodalModelProfile(
        name="Qwen/Qwen2-Audio-7B-Instruct",
        short_name="qwen2-audio-7b",
        topologies={
            "audio_agg": TopologyConfig(
                marks=[pytest.mark.nightly],
                timeout_s=600,
                directory=_AUDIO_DIR,
            ),
            "audio_disagg": TopologyConfig(
                marks=[pytest.mark.nightly],
                timeout_s=600,
                directory=_AUDIO_DIR,
                # Needs 3 GPUs (encode + prefill + decode), so it overrides the
                # profile-level gpu_2 marker.
                gpu_marker="gpu_4",
            ),
        },
        # Encode worker loads Qwen2Audio on GPU (~19 GiB).
        gpu_marker="gpu_2",
        request_payloads=[make_audio_payload(["Hester", "Pynne"])],
    ),
    # --- Gemma 3 4B, image input (gated on Hugging Face) ------------------
    MultimodalModelProfile(
        name="google/gemma-3-4b-it",
        short_name="gemma3-4b",
        topologies={
            "agg": TopologyConfig(
                marks=[pytest.mark.post_merge],
                timeout_s=300,
                profiled_vram_gib=12.0,
            ),
        },
        request_payloads=[make_image_payload(["green"])],
        extra_vllm_args=["--dtype", "bfloat16"],
        # Skipped unless DYN_HF_GATED_MODELS_ENABLED is set.
        gated=True,
    ),
]
...@@ -7,7 +7,6 @@ import logging ...@@ -7,7 +7,6 @@ import logging
import os import os
import random import random
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional from typing import Optional
import pytest import pytest
...@@ -19,8 +18,13 @@ from tests.serve.common import ( ...@@ -19,8 +18,13 @@ from tests.serve.common import (
) )
from tests.serve.conftest import MULTIMODAL_IMG_URL, get_multimodal_test_image_bytes from tests.serve.conftest import MULTIMODAL_IMG_URL, get_multimodal_test_image_bytes
from tests.serve.lora_utils import MinioLoraConfig from tests.serve.lora_utils import MinioLoraConfig
from tests.serve.multimodal_profiles.vllm import (
VLLM_MULTIMODAL_PROFILES,
VLLM_TOPOLOGY_SCRIPTS,
)
from tests.utils.constants import DefaultPort from tests.utils.constants import DefaultPort
from tests.utils.engine_process import EngineConfig from tests.utils.engine_process import EngineConfig
from tests.utils.multimodal import make_multimodal_configs
from tests.utils.payload_builder import ( from tests.utils.payload_builder import (
cached_tokens_chat_payload, cached_tokens_chat_payload,
chat_payload, chat_payload,
...@@ -51,11 +55,13 @@ class VLLMConfig(EngineConfig): ...@@ -51,11 +55,13 @@ class VLLMConfig(EngineConfig):
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join( vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
WORKSPACE_DIR, "examples/backends/vllm" WORKSPACE_DIR, "examples/backends/vllm"
) )
LOCAL_VIDEO_TEST_PATH = Path(
WORKSPACE_DIR, "lib/llm/tests/data/media/240p_10.mp4"
).resolve()
LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri()
# Generated multimodal configs from profile definitions
_mm_configs: dict[str, VLLMConfig] = {}
for _profile in VLLM_MULTIMODAL_PROFILES:
_mm_configs.update(
make_multimodal_configs(_profile, VLLMConfig, vllm_dir, VLLM_TOPOLOGY_SCRIPTS)
)
# vLLM test configurations # vLLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~5.5 minutes total to run sequentially (with models pre-cached) # NOTE: pytest.mark.gpu_1 tests take ~5.5 minutes total to run sequentially (with models pre-cached)
...@@ -64,6 +70,7 @@ LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri() ...@@ -64,6 +70,7 @@ LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri()
# A future collector/launcher can sum profiled_vram_gib values to decide how many tests fit # A future collector/launcher can sum profiled_vram_gib values to decide how many tests fit
# concurrently without exceeding available VRAM. # concurrently without exceeding available VRAM.
vllm_configs = { vllm_configs = {
**_mm_configs,
"aggregated": VLLMConfig( "aggregated": VLLMConfig(
name="aggregated", name="aggregated",
directory=vllm_dir, directory=vllm_dir,
...@@ -327,44 +334,6 @@ vllm_configs = { ...@@ -327,44 +334,6 @@ vllm_configs = {
completion_payload_default(), completion_payload_default(),
], ],
), ),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements
# NOTE: disagg_multimodal_e_pd.sh uses explicit --gpu-memory-utilization via
# DYN_ENCODE_GPU_MEM / DYN_PD_GPU_MEM env vars in single-GPU mode.
# PD worker honors build_vllm_gpu_mem_args for parallel execution.
"multimodal_e_pd_qwen": VLLMConfig(
name="multimodal_e_pd_qwen",
directory=vllm_dir,
script_name="disagg_multimodal_e_pd.sh",
marks=[
pytest.mark.gpu_1,
# No profiled_vram_gib / requested_vllm_kv_cache_bytes: single-GPU mode
# uses hardcoded fractions (encode=0.1, PD=0.7) that scale with GPU size.
pytest.mark.timeout(340), # ~5x observed 68.4s; 2B model loads slower on CI
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
# With proper prompt templating, the model actually only returns "green",
# verified behavior with native vLLM.
expected_response=["green"],
temperature=0.0,
max_tokens=100,
)
],
),
"multimodal_agg_frontend_decoding": VLLMConfig( "multimodal_agg_frontend_decoding": VLLMConfig(
name="multimodal_agg_frontend_decoding", name="multimodal_agg_frontend_decoding",
directory=vllm_dir, directory=vllm_dir,
...@@ -405,117 +374,6 @@ vllm_configs = { ...@@ -405,117 +374,6 @@ vllm_configs = {
) )
], ],
), ),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements.
# NOTE: disagg_multimodal_epd.sh uses --kv-cache-memory-bytes=512MB for P/D
# workers. Per vLLM CacheConfig, kv_cache_memory_bytes (when not-None) ignores
# gpu_memory_utilization (ref: https://docs.vllm.ai/en/stable/api/vllm/config/cache/),
# so KV cache overrides have no effect. Regardless of GPU_MEM
# fractions (0.1/0.4/0.4), the 3 workers combined consistently use ~17.6 GiB
# total on this GPU.
# NOTE: disagg_multimodal_epd.sh uses explicit --gpu-memory-utilization via
# DYN_ENCODE_GPU_MEM / DYN_PREFILL_GPU_MEM / DYN_DECODE_GPU_MEM env vars.
# P/D workers honor build_vllm_gpu_mem_args for parallel execution.
"multimodal_disagg_qwen": VLLMConfig(
name="multimodal_disagg_qwen",
directory=vllm_dir,
script_name="disagg_multimodal_epd.sh",
marks=[
pytest.mark.gpu_1,
# No profiled_vram_gib / requested_vllm_kv_cache_bytes: single-GPU mode
# uses hardcoded fractions via DYN_*_GPU_MEM that scale with GPU size.
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
timeout=300,
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["green"],
temperature=0.0,
max_tokens=100,
)
],
),
# P/D multimodal (no encoder): prefill loads images via PIL,
# computes grid_thw for decode using smart_resize.
"multimodal_p_d_qwen": VLLMConfig(
name="multimodal_p_d_qwen",
directory=vllm_dir,
script_name="disagg_multimodal_p_d.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
timeout=300,
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["green"],
temperature=0.0,
max_tokens=100,
)
],
),
"multimodal_agg_qwen": VLLMConfig(
name="multimodal_agg_qwen",
directory=vllm_dir,
script_name="agg_multimodal.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.profiled_vram_gib(19.9), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes(
922_354_000
), # KV cache cap (2x safety over min=461_176_832)
pytest.mark.timeout(
360
), # ~7x observed 50.0s; 7B model loads ~48s on CI (A10G/L4)
pytest.mark.post_merge,
],
model="Qwen/Qwen2.5-VL-7B-Instruct",
script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
delayed_start=0,
timeout=360,
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["purple"],
max_tokens=100,
),
],
),
"multimodal_agg_llava": VLLMConfig( "multimodal_agg_llava": VLLMConfig(
name="multimodal_agg_llava", name="multimodal_agg_llava",
directory=vllm_dir, directory=vllm_dir,
...@@ -560,125 +418,6 @@ vllm_configs = { ...@@ -560,125 +418,6 @@ vllm_configs = {
), ),
], ],
), ),
# Video multimodal tests for CI use the canonical aggregated multimodal launcher.
"multimodal_video_agg": VLLMConfig(
name="multimodal_video_agg",
directory=vllm_dir,
script_name="agg_multimodal.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
], # TODO: profile to get max_vram and timeout
model="Qwen/Qwen3-VL-2B-Instruct",
delayed_start=60, # Video models require longer loading time
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct"],
timeout=600, # 10 minutes for video processing overhead
request_payloads=[
chat_payload(
[
{"type": "text", "text": "Describe the video in detail"},
{
"type": "video_url",
"video_url": {"url": LOCAL_VIDEO_TEST_URI},
},
],
repeat_count=1,
expected_response=["red", "static", "still"],
temperature=0.0,
max_tokens=100,
)
],
),
"multimodal_video_disagg": VLLMConfig(
name="multimodal_video_disagg",
directory=vllm_dir,
script_name="disagg_multimodal_epd.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
], # TODO: profile to get max_vram and timeout
model="Qwen/Qwen3-VL-2B-Instruct",
delayed_start=60, # Video models require longer loading time
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
timeout=600, # 10 minutes for video processing overhead
request_payloads=[
chat_payload(
[
{"type": "text", "text": "Describe the video in detail"},
{
"type": "video_url",
"video_url": {"url": LOCAL_VIDEO_TEST_URI},
},
],
repeat_count=1,
expected_response=["red", "static", "still"],
temperature=0.0,
max_tokens=100,
)
],
),
# Audio multimodal tests for nightly CI pipeline
# These tests validate audio inference capabilities with Qwen2-Audio model
"multimodal_audio_agg": VLLMConfig(
name="multimodal_audio_agg",
directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
script_name="audio_agg.sh",
marks=[
pytest.mark.gpu_2, # encode worker loads Qwen2Audio on GPU (~19 GiB)
pytest.mark.nightly,
pytest.mark.timeout(600),
],
model="Qwen/Qwen2-Audio-7B-Instruct",
delayed_start=0,
script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
request_payloads=[
chat_payload(
[
{"type": "text", "text": "What is recited in the audio?"},
{
"type": "audio_url",
"audio_url": {
"url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
},
},
],
repeat_count=1,
expected_response=["Hester", "Pynne"],
temperature=0.0,
max_tokens=100,
)
],
),
"multimodal_audio_disagg": VLLMConfig(
name="multimodal_audio_disagg",
directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
script_name="audio_disagg.sh",
marks=[
pytest.mark.gpu_4, # needs 3 GPUs (encode loads Qwen2Audio ~19 GiB + prefill + decode)
pytest.mark.nightly,
pytest.mark.timeout(600),
],
model="Qwen/Qwen2-Audio-7B-Instruct",
delayed_start=0,
script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
request_payloads=[
chat_payload(
[
{"type": "text", "text": "What is recited in the audio?"},
{
"type": "audio_url",
"audio_url": {
"url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
},
},
],
repeat_count=1,
expected_response=["Hester", "Pynne"],
temperature=0.0,
max_tokens=100,
)
],
),
"aggregated_toolcalling": VLLMConfig( "aggregated_toolcalling": VLLMConfig(
name="aggregated_toolcalling", name="aggregated_toolcalling",
directory=vllm_dir, directory=vllm_dir,
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Optional, Type
import pytest
from dynamo.common.utils.paths import WORKSPACE_DIR
from tests.serve.conftest import MULTIMODAL_IMG_URL
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import chat_payload
from tests.utils.payloads import BasePayload, ChatPayload
# Absolute path of the repository's local test clip, plus the same location as
# a file:// URI for use in video_url payloads.
LOCAL_VIDEO_TEST_PATH = (
    Path(WORKSPACE_DIR) / "lib/llm/tests/data/media/240p_10.mp4"
).resolve()
LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri()

# Remote WAV sample used by the audio payload.
AUDIO_TEST_URL = (
    "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client"
    "/main/datasets/mini_en/wav/1221-135766-0002.wav"
)
# ---------------------------------------------------------------------------
# Payload factories
# ---------------------------------------------------------------------------
def make_image_payload(expected_response: list[str]) -> ChatPayload:
    """Build the shared image color-identification chat payload.

    The request pairs a fixed text prompt with ``MULTIMODAL_IMG_URL`` and is
    sent once with ``temperature=0.0`` and ``max_tokens=100``.

    Parameters
    ----------
    expected_response:
        Forwarded unchanged to ``chat_payload`` as ``expected_response``.
    """
    prompt_part = {
        "type": "text",
        "text": "What colors are in the following image? "
        "Respond only with the colors.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": MULTIMODAL_IMG_URL},
    }
    return chat_payload(
        [prompt_part, image_part],
        repeat_count=1,
        expected_response=expected_response,
        temperature=0.0,
        max_tokens=100,
    )
def make_video_payload(expected_response: list[str]) -> ChatPayload:
    """Build the shared video-description chat payload.

    The request pairs a fixed text prompt with the local test clip
    (``LOCAL_VIDEO_TEST_URI``) and is sent once with ``temperature=0.0`` and
    ``max_tokens=100``.

    Parameters
    ----------
    expected_response:
        Forwarded unchanged to ``chat_payload`` as ``expected_response``.
    """
    prompt_part = {"type": "text", "text": "Describe the video in detail"}
    video_part = {
        "type": "video_url",
        "video_url": {"url": LOCAL_VIDEO_TEST_URI},
    }
    return chat_payload(
        [prompt_part, video_part],
        repeat_count=1,
        expected_response=expected_response,
        temperature=0.0,
        max_tokens=100,
    )
def make_audio_payload(expected_response: list[str]) -> ChatPayload:
    """Build the shared audio chat payload.

    The request pairs a fixed text prompt with the remote WAV sample
    (``AUDIO_TEST_URL``) and is sent once with ``temperature=0.0`` and
    ``max_tokens=100``.

    Parameters
    ----------
    expected_response:
        Forwarded unchanged to ``chat_payload`` as ``expected_response``.
    """
    prompt_part = {"type": "text", "text": "What is recited in the audio?"}
    audio_part = {
        "type": "audio_url",
        "audio_url": {"url": AUDIO_TEST_URL},
    }
    return chat_payload(
        [prompt_part, audio_part],
        repeat_count=1,
        expected_response=expected_response,
        temperature=0.0,
        max_tokens=100,
    )
# ---------------------------------------------------------------------------
# Config dataclasses
# ---------------------------------------------------------------------------
@dataclass
class TopologyConfig:
    """Settings for a single deployment topology of a multimodal profile.

    ``make_multimodal_configs`` reads one instance per topology key and turns
    it into pytest marks, script arguments and config fields.
    """

    # Extra pytest marks appended after the generated GPU and timeout marks.
    marks: list[Any] = field(default_factory=list)
    # Seconds passed to ``pytest.mark.timeout``.
    timeout_s: int = 300
    # When set, emitted as a ``profiled_vram_gib`` mark.
    profiled_vram_gib: Optional[float] = None
    # When set, emitted as a ``requested_vllm_kv_cache_bytes`` mark.
    requested_vllm_kv_cache_bytes: Optional[int] = None
    # Forwarded to the generated config's ``delayed_start``.
    delayed_start: int = 0
    # Replaces the default script directory when set.
    directory: Optional[str] = None
    # Name of the ``pytest.mark`` GPU marker; replaces the profile-level one.
    gpu_marker: Optional[str] = None
    # When True, ``--single-gpu`` is appended to the script arguments.
    single_gpu: bool = False
@dataclass
class MultimodalModelProfile:
    """Test-relevant description of one multimodal model.

    :func:`make_multimodal_configs` expands a profile into one config per
    entry in ``topologies``.
    """

    # HuggingFace model ID; passed to the launch script via ``--model``.
    name: str
    # Kebab-case slug used as the suffix of the generated config key.
    short_name: str
    # Topology key -> per-topology settings.
    topologies: dict[str, TopologyConfig]
    # Payloads attached to every config generated from this profile.
    request_payloads: list[BasePayload]
    # Name of the ``pytest.mark`` GPU marker used unless a topology overrides it.
    gpu_marker: str = "gpu_1"
    # Extra arguments appended after ``--model <name>``.
    extra_vllm_args: list[str] = field(default_factory=list)
    # Marks shared across all topologies of this profile.
    marks: list[Any] = field(default_factory=list)
    # If True, skip unless DYN_HF_GATED_MODELS_ENABLED=1.
    gated: bool = False
# ---------------------------------------------------------------------------
# Config generator
# ---------------------------------------------------------------------------
# Values of DYN_HF_GATED_MODELS_ENABLED that explicitly mean "disabled".
_GATED_DISABLED_VALUES = frozenset({"", "0", "false", "no", "off"})


def _gated_models_enabled() -> bool:
    """Return True when DYN_HF_GATED_MODELS_ENABLED opts in to gated models.

    An unset or empty variable, or an explicit "0"/"false"/"no"/"off"
    (case-insensitive), counts as disabled; any other value enables.
    """
    raw = os.environ.get("DYN_HF_GATED_MODELS_ENABLED", "")
    return raw.strip().lower() not in _GATED_DISABLED_VALUES


def make_multimodal_configs(
    profile: MultimodalModelProfile,
    config_cls: Type[EngineConfig],
    directory: str,
    topology_scripts: dict[str, str],
) -> dict[str, EngineConfig]:
    """Generate config entries for each topology in *profile*.

    Parameters
    ----------
    profile:
        The model profile to expand.
    config_cls:
        The concrete config class to instantiate (e.g. ``VLLMConfig``).
    directory:
        Default directory; overridden by ``TopologyConfig.directory`` if set.
    topology_scripts:
        Mapping from topology key to shell script filename.

    Returns
    -------
    dict
        ``mm_<topology>_<short_name>`` -> generated config, in the iteration
        order of ``profile.topologies``.

    Raises
    ------
    KeyError
        If a topology in *profile* has no entry in *topology_scripts*.
    """
    configs: dict[str, EngineConfig] = {}
    for topology, topo_cfg in profile.topologies.items():
        try:
            script_name = topology_scripts[topology]
        except KeyError:
            known = ", ".join(sorted(topology_scripts)) or "<none>"
            # Same exception type as a plain lookup, but say which profile and
            # topology are misconfigured.
            raise KeyError(
                f"profile {profile.short_name!r} uses topology {topology!r}, "
                f"which has no launch script (known topologies: {known})"
            ) from None

        script_args = ["--model", profile.name, *profile.extra_vllm_args]
        if topo_cfg.single_gpu:
            script_args.append("--single-gpu")

        # A topology-level GPU marker wins over the profile-level default.
        gpu = topo_cfg.gpu_marker or profile.gpu_marker

        # Mark order: GPU, timeout, topology marks, VRAM/KV hints, gated skip,
        # then profile-wide marks.
        # NOTE(review): only the pytest timeout mark is derived from timeout_s;
        # the config's own ``timeout`` field is left at its default - confirm
        # that is intended.
        marks: list[Any] = [
            getattr(pytest.mark, gpu),
            pytest.mark.timeout(topo_cfg.timeout_s),
        ]
        marks.extend(topo_cfg.marks)
        if topo_cfg.profiled_vram_gib is not None:
            marks.append(pytest.mark.profiled_vram_gib(topo_cfg.profiled_vram_gib))
        if topo_cfg.requested_vllm_kv_cache_bytes is not None:
            marks.append(
                pytest.mark.requested_vllm_kv_cache_bytes(
                    topo_cfg.requested_vllm_kv_cache_bytes
                )
            )
        if profile.gated:
            # The condition is evaluated here, when the configs are generated.
            # Explicit "0"/"false" values keep gated tests skipped rather than
            # enabling them through plain string truthiness.
            marks.append(
                pytest.mark.skipif(
                    not _gated_models_enabled(),
                    reason=(
                        f"{profile.name} is gated; set DYN_HF_GATED_MODELS_ENABLED=1 "
                        "with an HF_TOKEN that has accepted the license"
                    ),
                )
            )
        marks.extend(profile.marks)

        key = f"mm_{topology}_{profile.short_name}"
        configs[key] = config_cls(
            name=key,
            directory=topo_cfg.directory or directory,
            script_name=script_name,
            model=profile.name,
            script_args=script_args,
            marks=marks,
            delayed_start=topo_cfg.delayed_start,
            # Give each config its own list so mutating one topology's payloads
            # cannot leak into its siblings or back into the profile.
            request_payloads=list(profile.request_payloads),
        )
    return configs
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment