Unverified Commit f45a6985 authored by Qi Wang's avatar Qi Wang Committed by GitHub
Browse files

feat(test): add generalized multimodal model coverage framework (#7975)


Co-authored-by: default avatarClaude Opus 4.6 (1M context) <noreply@anthropic.com>
parent e45bb0fe
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from dynamo.common.utils.paths import WORKSPACE_DIR
from tests.utils.multimodal import (
MultimodalModelProfile,
TopologyConfig,
make_audio_payload,
make_image_payload,
make_video_payload,
)
VLLM_TOPOLOGY_SCRIPTS: dict[str, str] = {
"agg": "agg_multimodal.sh",
"e_pd": "disagg_multimodal_e_pd.sh",
"epd": "disagg_multimodal_epd.sh",
"p_d": "disagg_multimodal_p_d.sh",
"audio_agg": "audio_agg.sh",
"audio_disagg": "audio_disagg.sh",
}
_AUDIO_DIR = os.path.join(WORKSPACE_DIR, "examples/multimodal")
VLLM_MULTIMODAL_PROFILES: list[MultimodalModelProfile] = [
MultimodalModelProfile(
name="Qwen/Qwen3-VL-2B-Instruct",
short_name="qwen3-vl-2b",
topologies={
"agg": TopologyConfig(
marks=[pytest.mark.post_merge],
timeout_s=220,
profiled_vram_gib=9.6,
),
"e_pd": TopologyConfig(
marks=[pytest.mark.pre_merge],
timeout_s=340,
single_gpu=True,
),
"epd": TopologyConfig(
marks=[pytest.mark.pre_merge],
timeout_s=300,
single_gpu=True,
),
"p_d": TopologyConfig(
marks=[pytest.mark.pre_merge],
timeout_s=300,
single_gpu=True,
),
},
request_payloads=[make_image_payload(["green"])],
),
MultimodalModelProfile(
name="Qwen/Qwen3-VL-2B-Instruct",
short_name="qwen3-vl-2b-video",
topologies={
"agg": TopologyConfig(
marks=[pytest.mark.pre_merge],
timeout_s=600,
delayed_start=60,
),
"epd": TopologyConfig(
marks=[pytest.mark.pre_merge],
timeout_s=600,
delayed_start=60,
single_gpu=True,
),
},
request_payloads=[make_video_payload(["red", "static", "still"])],
),
MultimodalModelProfile(
name="Qwen/Qwen2.5-VL-7B-Instruct",
short_name="qwen2.5-vl-7b",
topologies={
"agg": TopologyConfig(
marks=[pytest.mark.post_merge],
timeout_s=360,
profiled_vram_gib=19.9,
requested_vllm_kv_cache_bytes=922_354_000,
),
},
request_payloads=[make_image_payload(["purple"])],
),
MultimodalModelProfile(
name="Qwen/Qwen2-Audio-7B-Instruct",
short_name="qwen2-audio-7b",
topologies={
"audio_agg": TopologyConfig(
marks=[pytest.mark.nightly],
timeout_s=600,
directory=_AUDIO_DIR,
),
"audio_disagg": TopologyConfig(
marks=[pytest.mark.nightly],
timeout_s=600,
directory=_AUDIO_DIR,
gpu_marker="gpu_4",
),
},
gpu_marker="gpu_2",
request_payloads=[make_audio_payload(["Hester", "Pynne"])],
),
MultimodalModelProfile(
name="google/gemma-3-4b-it",
short_name="gemma3-4b",
topologies={
"agg": TopologyConfig(
marks=[pytest.mark.post_merge],
timeout_s=300,
profiled_vram_gib=12.0,
),
},
request_payloads=[make_image_payload(["green"])],
extra_vllm_args=["--dtype", "bfloat16"],
gated=True,
),
]
......@@ -7,7 +7,6 @@ import logging
import os
import random
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import pytest
......@@ -19,8 +18,13 @@ from tests.serve.common import (
)
from tests.serve.conftest import MULTIMODAL_IMG_URL, get_multimodal_test_image_bytes
from tests.serve.lora_utils import MinioLoraConfig
from tests.serve.multimodal_profiles.vllm import (
VLLM_MULTIMODAL_PROFILES,
VLLM_TOPOLOGY_SCRIPTS,
)
from tests.utils.constants import DefaultPort
from tests.utils.engine_process import EngineConfig
from tests.utils.multimodal import make_multimodal_configs
from tests.utils.payload_builder import (
cached_tokens_chat_payload,
chat_payload,
......@@ -51,11 +55,13 @@ class VLLMConfig(EngineConfig):
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
WORKSPACE_DIR, "examples/backends/vllm"
)
LOCAL_VIDEO_TEST_PATH = Path(
WORKSPACE_DIR, "lib/llm/tests/data/media/240p_10.mp4"
).resolve()
LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri()
# Generated multimodal configs from profile definitions
_mm_configs: dict[str, VLLMConfig] = {}
for _profile in VLLM_MULTIMODAL_PROFILES:
_mm_configs.update(
make_multimodal_configs(_profile, VLLMConfig, vllm_dir, VLLM_TOPOLOGY_SCRIPTS)
)
# vLLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~5.5 minutes total to run sequentially (with models pre-cached)
......@@ -64,6 +70,7 @@ LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri()
# A future collector/launcher can sum profiled_vram_gib values to decide how many tests fit
# concurrently without exceeding available VRAM.
vllm_configs = {
**_mm_configs,
"aggregated": VLLMConfig(
name="aggregated",
directory=vllm_dir,
......@@ -327,44 +334,6 @@ vllm_configs = {
completion_payload_default(),
],
),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements
# NOTE: disagg_multimodal_e_pd.sh uses explicit --gpu-memory-utilization via
# DYN_ENCODE_GPU_MEM / DYN_PD_GPU_MEM env vars in single-GPU mode.
# PD worker honors build_vllm_gpu_mem_args for parallel execution.
"multimodal_e_pd_qwen": VLLMConfig(
name="multimodal_e_pd_qwen",
directory=vllm_dir,
script_name="disagg_multimodal_e_pd.sh",
marks=[
pytest.mark.gpu_1,
# No profiled_vram_gib / requested_vllm_kv_cache_bytes: single-GPU mode
# uses hardcoded fractions (encode=0.1, PD=0.7) that scale with GPU size.
pytest.mark.timeout(340), # ~5x observed 68.4s; 2B model loads slower on CI
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
# With proper prompt templating, the model actually only returns "green",
# verified behavior with native vLLM.
expected_response=["green"],
temperature=0.0,
max_tokens=100,
)
],
),
"multimodal_agg_frontend_decoding": VLLMConfig(
name="multimodal_agg_frontend_decoding",
directory=vllm_dir,
......@@ -405,117 +374,6 @@ vllm_configs = {
)
],
),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements.
# NOTE: disagg_multimodal_epd.sh uses --kv-cache-memory-bytes=512MB for P/D
# workers. Per vLLM CacheConfig, kv_cache_memory_bytes (when not-None) ignores
# gpu_memory_utilization (ref: https://docs.vllm.ai/en/stable/api/vllm/config/cache/),
# so KV cache overrides have no effect. Regardless of GPU_MEM
# fractions (0.1/0.4/0.4), the 3 workers combined consistently use ~17.6 GiB
# total on this GPU.
# NOTE: disagg_multimodal_epd.sh uses explicit --gpu-memory-utilization via
# DYN_ENCODE_GPU_MEM / DYN_PREFILL_GPU_MEM / DYN_DECODE_GPU_MEM env vars.
# P/D workers honor build_vllm_gpu_mem_args for parallel execution.
"multimodal_disagg_qwen": VLLMConfig(
name="multimodal_disagg_qwen",
directory=vllm_dir,
script_name="disagg_multimodal_epd.sh",
marks=[
pytest.mark.gpu_1,
# No profiled_vram_gib / requested_vllm_kv_cache_bytes: single-GPU mode
# uses hardcoded fractions via DYN_*_GPU_MEM that scale with GPU size.
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
timeout=300,
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["green"],
temperature=0.0,
max_tokens=100,
)
],
),
# P/D multimodal (no encoder): prefill loads images via PIL,
# computes grid_thw for decode using smart_resize.
"multimodal_p_d_qwen": VLLMConfig(
name="multimodal_p_d_qwen",
directory=vllm_dir,
script_name="disagg_multimodal_p_d.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
],
model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
timeout=300,
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["green"],
temperature=0.0,
max_tokens=100,
)
],
),
"multimodal_agg_qwen": VLLMConfig(
name="multimodal_agg_qwen",
directory=vllm_dir,
script_name="agg_multimodal.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.profiled_vram_gib(19.9), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes(
922_354_000
), # KV cache cap (2x safety over min=461_176_832)
pytest.mark.timeout(
360
), # ~7x observed 50.0s; 7B model loads ~48s on CI (A10G/L4)
pytest.mark.post_merge,
],
model="Qwen/Qwen2.5-VL-7B-Instruct",
script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
delayed_start=0,
timeout=360,
request_payloads=[
chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=["purple"],
max_tokens=100,
),
],
),
"multimodal_agg_llava": VLLMConfig(
name="multimodal_agg_llava",
directory=vllm_dir,
......@@ -560,125 +418,6 @@ vllm_configs = {
),
],
),
# Video multimodal tests for CI use the canonical aggregated multimodal launcher.
"multimodal_video_agg": VLLMConfig(
name="multimodal_video_agg",
directory=vllm_dir,
script_name="agg_multimodal.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
], # TODO: profile to get max_vram and timeout
model="Qwen/Qwen3-VL-2B-Instruct",
delayed_start=60, # Video models require longer loading time
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct"],
timeout=600, # 10 minutes for video processing overhead
request_payloads=[
chat_payload(
[
{"type": "text", "text": "Describe the video in detail"},
{
"type": "video_url",
"video_url": {"url": LOCAL_VIDEO_TEST_URI},
},
],
repeat_count=1,
expected_response=["red", "static", "still"],
temperature=0.0,
max_tokens=100,
)
],
),
"multimodal_video_disagg": VLLMConfig(
name="multimodal_video_disagg",
directory=vllm_dir,
script_name="disagg_multimodal_epd.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
], # TODO: profile to get max_vram and timeout
model="Qwen/Qwen3-VL-2B-Instruct",
delayed_start=60, # Video models require longer loading time
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
timeout=600, # 10 minutes for video processing overhead
request_payloads=[
chat_payload(
[
{"type": "text", "text": "Describe the video in detail"},
{
"type": "video_url",
"video_url": {"url": LOCAL_VIDEO_TEST_URI},
},
],
repeat_count=1,
expected_response=["red", "static", "still"],
temperature=0.0,
max_tokens=100,
)
],
),
# Audio multimodal tests for nightly CI pipeline
# These tests validate audio inference capabilities with Qwen2-Audio model
"multimodal_audio_agg": VLLMConfig(
name="multimodal_audio_agg",
directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
script_name="audio_agg.sh",
marks=[
pytest.mark.gpu_2, # encode worker loads Qwen2Audio on GPU (~19 GiB)
pytest.mark.nightly,
pytest.mark.timeout(600),
],
model="Qwen/Qwen2-Audio-7B-Instruct",
delayed_start=0,
script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
request_payloads=[
chat_payload(
[
{"type": "text", "text": "What is recited in the audio?"},
{
"type": "audio_url",
"audio_url": {
"url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
},
},
],
repeat_count=1,
expected_response=["Hester", "Pynne"],
temperature=0.0,
max_tokens=100,
)
],
),
"multimodal_audio_disagg": VLLMConfig(
name="multimodal_audio_disagg",
directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
script_name="audio_disagg.sh",
marks=[
pytest.mark.gpu_4, # needs 3 GPUs (encode loads Qwen2Audio ~19 GiB + prefill + decode)
pytest.mark.nightly,
pytest.mark.timeout(600),
],
model="Qwen/Qwen2-Audio-7B-Instruct",
delayed_start=0,
script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
request_payloads=[
chat_payload(
[
{"type": "text", "text": "What is recited in the audio?"},
{
"type": "audio_url",
"audio_url": {
"url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
},
},
],
repeat_count=1,
expected_response=["Hester", "Pynne"],
temperature=0.0,
max_tokens=100,
)
],
),
"aggregated_toolcalling": VLLMConfig(
name="aggregated_toolcalling",
directory=vllm_dir,
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Optional, Type
import pytest
from dynamo.common.utils.paths import WORKSPACE_DIR
from tests.serve.conftest import MULTIMODAL_IMG_URL
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import chat_payload
from tests.utils.payloads import BasePayload, ChatPayload
LOCAL_VIDEO_TEST_PATH = Path(
WORKSPACE_DIR, "lib/llm/tests/data/media/240p_10.mp4"
).resolve()
LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri()
AUDIO_TEST_URL = (
"https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client"
"/main/datasets/mini_en/wav/1221-135766-0002.wav"
)
# ---------------------------------------------------------------------------
# Payload factories
# ---------------------------------------------------------------------------
def make_image_payload(expected_response: list[str]) -> ChatPayload:
"""Standard image color-identification payload using MULTIMODAL_IMG_URL."""
return chat_payload(
[
{
"type": "text",
"text": "What colors are in the following image? "
"Respond only with the colors.",
},
{
"type": "image_url",
"image_url": {"url": MULTIMODAL_IMG_URL},
},
],
repeat_count=1,
expected_response=expected_response,
temperature=0.0,
max_tokens=100,
)
def make_video_payload(expected_response: list[str]) -> ChatPayload:
"""Standard video description payload using the local test video."""
return chat_payload(
[
{"type": "text", "text": "Describe the video in detail"},
{
"type": "video_url",
"video_url": {"url": LOCAL_VIDEO_TEST_URI},
},
],
repeat_count=1,
expected_response=expected_response,
temperature=0.0,
max_tokens=100,
)
def make_audio_payload(expected_response: list[str]) -> ChatPayload:
"""Standard audio transcription payload using the remote test WAV."""
return chat_payload(
[
{"type": "text", "text": "What is recited in the audio?"},
{
"type": "audio_url",
"audio_url": {"url": AUDIO_TEST_URL},
},
],
repeat_count=1,
expected_response=expected_response,
temperature=0.0,
max_tokens=100,
)
# ---------------------------------------------------------------------------
# Config dataclasses
# ---------------------------------------------------------------------------
@dataclass
class TopologyConfig:
"""Per-topology overrides for marks, timeout, and VRAM profiling."""
marks: list[Any] = field(default_factory=list)
timeout_s: int = 300
profiled_vram_gib: Optional[float] = None
requested_vllm_kv_cache_bytes: Optional[int] = None
delayed_start: int = 0
directory: Optional[str] = None # override profile-level directory
gpu_marker: Optional[str] = None # override profile-level gpu_marker
single_gpu: bool = False # append --single-gpu to script_args
@dataclass
class MultimodalModelProfile:
"""Describes a multimodal model's test-relevant properties.
Each profile generates one config per topology in ``topologies``
via :func:`make_multimodal_configs`.
"""
name: str # HuggingFace model ID
short_name: str # kebab-case slug for config key
topologies: dict[str, TopologyConfig]
request_payloads: list[BasePayload]
gpu_marker: str = "gpu_1"
extra_vllm_args: list[str] = field(default_factory=list)
marks: list[Any] = field(default_factory=list) # shared across all topologies
gated: bool = False # if True, skip unless DYN_HF_GATED_MODELS_ENABLED=1
# ---------------------------------------------------------------------------
# Config generator
# ---------------------------------------------------------------------------
def make_multimodal_configs(
profile: MultimodalModelProfile,
config_cls: Type[EngineConfig],
directory: str,
topology_scripts: dict[str, str],
) -> dict[str, EngineConfig]:
"""Generate config entries for each topology in *profile*.
Parameters
----------
config_cls:
The concrete config class to instantiate (e.g. ``VLLMConfig``).
directory:
Default directory; overridden by ``TopologyConfig.directory`` if set.
topology_scripts:
Mapping from topology key to shell script filename.
"""
configs: dict[str, EngineConfig] = {}
for topology, topo_cfg in profile.topologies.items():
script_name = topology_scripts[topology]
script_args = ["--model", profile.name] + profile.extra_vllm_args
if topo_cfg.single_gpu:
script_args.append("--single-gpu")
gpu = topo_cfg.gpu_marker or profile.gpu_marker
marks: list[Any] = [
getattr(pytest.mark, gpu),
pytest.mark.timeout(topo_cfg.timeout_s),
]
marks.extend(topo_cfg.marks)
if topo_cfg.profiled_vram_gib is not None:
marks.append(pytest.mark.profiled_vram_gib(topo_cfg.profiled_vram_gib))
if topo_cfg.requested_vllm_kv_cache_bytes is not None:
marks.append(
pytest.mark.requested_vllm_kv_cache_bytes(
topo_cfg.requested_vllm_kv_cache_bytes
)
)
if profile.gated:
marks.append(
pytest.mark.skipif(
not os.environ.get("DYN_HF_GATED_MODELS_ENABLED"),
reason=(
f"{profile.name} is gated; set DYN_HF_GATED_MODELS_ENABLED=1 "
"with an HF_TOKEN that has accepted the license"
),
)
)
marks.extend(profile.marks)
key = f"mm_{topology}_{profile.short_name}"
configs[key] = config_cls(
name=key,
directory=topo_cfg.directory or directory,
script_name=script_name,
model=profile.name,
script_args=script_args,
marks=marks,
delayed_start=topo_cfg.delayed_start,
request_payloads=profile.request_payloads,
)
return configs
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment