feat(test): add generalized multimodal model coverage framework (#7975)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

feat(test): add generalized multimodal model coverage framework (#7975)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
f45a6985 · Qi Wang · GitHub · e45bb0fe · f45a6985 · f45a6985
Unverified commit f45a6985 — authored Apr 08, 2026 by Qi Wang; committed by GitHub Apr 09, 2026
4 changed files
--- a/tests/serve/multimodal_profiles/__init__.py
+++ b/tests/serve/multimodal_profiles/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
--- a/tests/serve/multimodal_profiles/vllm.py
+++ b/tests/serve/multimodal_profiles/vllm.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+import pytest
+
+from dynamo.common.utils.paths import WORKSPACE_DIR
+from tests.utils.multimodal import (
+    MultimodalModelProfile,
+    TopologyConfig,
+    make_audio_payload,
+    make_image_payload,
+    make_video_payload,
+)
+
+# Maps each topology key used in ``MultimodalModelProfile.topologies`` to the
+# shell script filename that launches it.
+VLLM_TOPOLOGY_SCRIPTS: dict[str, str] = {
+    "agg": "agg_multimodal.sh",
+    "e_pd": "disagg_multimodal_e_pd.sh",
+    "epd": "disagg_multimodal_epd.sh",
+    "p_d": "disagg_multimodal_p_d.sh",
+    "audio_agg": "audio_agg.sh",
+    "audio_disagg": "audio_disagg.sh",
+}
+
+# Passed as the ``directory`` override for the audio topologies below.
+_AUDIO_DIR = os.path.join(WORKSPACE_DIR, "examples/multimodal")
+
+# One profile per (model, payload kind); each topology entry becomes one generated
+# config via make_multimodal_configs.
+VLLM_MULTIMODAL_PROFILES: list[MultimodalModelProfile] = [
+    MultimodalModelProfile(
+        name="Qwen/Qwen3-VL-2B-Instruct",
+        short_name="qwen3-vl-2b",
+        topologies={
+            "agg": TopologyConfig(
+                marks=[pytest.mark.post_merge],
+                timeout_s=220,
+                profiled_vram_gib=9.6,
+            ),
+            # Single-GPU topologies: pack all workers on 1 GPU for lower CI
+            # resource requirements. No profiled_vram_gib /
+            # requested_vllm_kv_cache_bytes here: single-GPU mode uses hardcoded
+            # GPU memory fractions that scale with GPU size.
+            "e_pd": TopologyConfig(
+                marks=[pytest.mark.pre_merge],
+                timeout_s=340,  # ~5x observed 68.4s; 2B model loads slower on CI
+                single_gpu=True,
+            ),
+            "epd": TopologyConfig(
+                marks=[pytest.mark.pre_merge],
+                timeout_s=300,
+                single_gpu=True,
+            ),
+            "p_d": TopologyConfig(
+                marks=[pytest.mark.pre_merge],
+                timeout_s=300,
+                single_gpu=True,
+            ),
+        },
+        request_payloads=[make_image_payload(["green"])],
+    ),
+    MultimodalModelProfile(
+        name="Qwen/Qwen3-VL-2B-Instruct",
+        short_name="qwen3-vl-2b-video",
+        topologies={
+            "agg": TopologyConfig(
+                marks=[pytest.mark.pre_merge],
+                timeout_s=600,  # 10 minutes for video processing overhead
+                delayed_start=60,  # video models require longer loading time
+            ),
+            "epd": TopologyConfig(
+                marks=[pytest.mark.pre_merge],
+                timeout_s=600,  # 10 minutes for video processing overhead
+                delayed_start=60,  # video models require longer loading time
+                single_gpu=True,
+            ),
+        },
+        request_payloads=[make_video_payload(["red", "static", "still"])],
+    ),
+    MultimodalModelProfile(
+        name="Qwen/Qwen2.5-VL-7B-Instruct",
+        short_name="qwen2.5-vl-7b",
+        topologies={
+            "agg": TopologyConfig(
+                marks=[pytest.mark.post_merge],
+                timeout_s=360,  # ~7x observed 50.0s; 7B model loads ~48s on CI (A10G/L4)
+                profiled_vram_gib=19.9,  # actual profiled peak with kv-bytes
+                requested_vllm_kv_cache_bytes=922_354_000,  # KV cache cap (2x safety over min=461_176_832)
+            ),
+        },
+        request_payloads=[make_image_payload(["purple"])],
+    ),
+    MultimodalModelProfile(
+        name="Qwen/Qwen2-Audio-7B-Instruct",
+        short_name="qwen2-audio-7b",
+        topologies={
+            "audio_agg": TopologyConfig(
+                marks=[pytest.mark.nightly],
+                timeout_s=600,
+                directory=_AUDIO_DIR,
+            ),
+            "audio_disagg": TopologyConfig(
+                marks=[pytest.mark.nightly],
+                timeout_s=600,
+                directory=_AUDIO_DIR,
+                gpu_marker="gpu_4",  # needs 3 GPUs (encode loads Qwen2Audio ~19 GiB + prefill + decode)
+            ),
+        },
+        gpu_marker="gpu_2",  # encode worker loads Qwen2Audio on GPU (~19 GiB)
+        request_payloads=[make_audio_payload(["Hester", "Pynne"])],
+    ),
+    MultimodalModelProfile(
+        name="google/gemma-3-4b-it",
+        short_name="gemma3-4b",
+        topologies={
+            "agg": TopologyConfig(
+                marks=[pytest.mark.post_merge],
+                timeout_s=300,
+                profiled_vram_gib=12.0,
+            ),
+        },
+        request_payloads=[make_image_payload(["green"])],
+        extra_vllm_args=["--dtype", "bfloat16"],
+        gated=True,  # adds a skipif mark keyed on DYN_HF_GATED_MODELS_ENABLED
+    ),
+]
--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -7,7 +7,6 @@ import logging
 import os
 import random
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import Optional

 import pytest
@@ -19,8 +18,13 @@ from tests.serve.common import (
 )
 from tests.serve.conftest import MULTIMODAL_IMG_URL, get_multimodal_test_image_bytes
 from tests.serve.lora_utils import MinioLoraConfig
+from tests.serve.multimodal_profiles.vllm import (
+    VLLM_MULTIMODAL_PROFILES,
+    VLLM_TOPOLOGY_SCRIPTS,
+)
 from tests.utils.constants import DefaultPort
 from tests.utils.engine_process import EngineConfig
+from tests.utils.multimodal import make_multimodal_configs
 from tests.utils.payload_builder import (
    cached_tokens_chat_payload,
    chat_payload,
@@ -51,11 +55,13 @@ class VLLMConfig(EngineConfig):
 vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/vllm"
 )
-LOCAL_VIDEO_TEST_PATH = Path(
-    WORKSPACE_DIR, "lib/llm/tests/data/media/240p_10.mp4"
-).resolve()
-LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri()

+# Multimodal configs generated from VLLM_MULTIMODAL_PROFILES (merged into vllm_configs below)
+_mm_configs: dict[str, VLLMConfig] = {}
+for _profile in VLLM_MULTIMODAL_PROFILES:
+    _mm_configs.update(
+        make_multimodal_configs(_profile, VLLMConfig, vllm_dir, VLLM_TOPOLOGY_SCRIPTS)
+    )

 # vLLM test configurations
 # NOTE: pytest.mark.gpu_1 tests take ~5.5 minutes total to run sequentially (with models pre-cached)
@@ -64,6 +70,7 @@ LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri()
 # A future collector/launcher can sum profiled_vram_gib values to decide how many tests fit
 # concurrently without exceeding available VRAM.
 vllm_configs = {
+    **_mm_configs,
    "aggregated": VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
@@ -327,44 +334,6 @@ vllm_configs = {
            completion_payload_default(),
        ],
    ),
-    # NOTE: Pack all workers on 1 GPU for lower CI resource requirements
-    # NOTE: disagg_multimodal_e_pd.sh uses explicit --gpu-memory-utilization via
-    # DYN_ENCODE_GPU_MEM / DYN_PD_GPU_MEM env vars in single-GPU mode.
-    # PD worker honors build_vllm_gpu_mem_args for parallel execution.
-    "multimodal_e_pd_qwen": VLLMConfig(
-        name="multimodal_e_pd_qwen",
-        directory=vllm_dir,
-        script_name="disagg_multimodal_e_pd.sh",
-        marks=[
-            pytest.mark.gpu_1,
-            # No profiled_vram_gib / requested_vllm_kv_cache_bytes: single-GPU mode
-            # uses hardcoded fractions (encode=0.1, PD=0.7) that scale with GPU size.
-            pytest.mark.timeout(340),  # ~5x observed 68.4s; 2B model loads slower on CI
-            pytest.mark.pre_merge,
-        ],
-        model="Qwen/Qwen3-VL-2B-Instruct",
-        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
-        request_payloads=[
-            chat_payload(
-                [
-                    {
-                        "type": "text",
-                        "text": "What colors are in the following image? Respond only with the colors.",
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": MULTIMODAL_IMG_URL},
-                    },
-                ],
-                repeat_count=1,
-                # With proper prompt templating, the model actually only returns "green",
-                # verified behavior with native vLLM.
-                expected_response=["green"],
-                temperature=0.0,
-                max_tokens=100,
-            )
-        ],
-    ),
    "multimodal_agg_frontend_decoding": VLLMConfig(
        name="multimodal_agg_frontend_decoding",
        directory=vllm_dir,
@@ -405,117 +374,6 @@ vllm_configs = {
            )
        ],
    ),
-    # NOTE: Pack all workers on 1 GPU for lower CI resource requirements.
-    # NOTE: disagg_multimodal_epd.sh uses --kv-cache-memory-bytes=512MB for P/D
-    # workers. Per vLLM CacheConfig, kv_cache_memory_bytes (when not-None) ignores
-    # gpu_memory_utilization (ref: https://docs.vllm.ai/en/stable/api/vllm/config/cache/),
-    # so KV cache overrides have no effect. Regardless of GPU_MEM
-    # fractions (0.1/0.4/0.4), the 3 workers combined consistently use ~17.6 GiB
-    # total on this GPU.
-    # NOTE: disagg_multimodal_epd.sh uses explicit --gpu-memory-utilization via
-    # DYN_ENCODE_GPU_MEM / DYN_PREFILL_GPU_MEM / DYN_DECODE_GPU_MEM env vars.
-    # P/D workers honor build_vllm_gpu_mem_args for parallel execution.
-    "multimodal_disagg_qwen": VLLMConfig(
-        name="multimodal_disagg_qwen",
-        directory=vllm_dir,
-        script_name="disagg_multimodal_epd.sh",
-        marks=[
-            pytest.mark.gpu_1,
-            # No profiled_vram_gib / requested_vllm_kv_cache_bytes: single-GPU mode
-            # uses hardcoded fractions via DYN_*_GPU_MEM that scale with GPU size.
-            pytest.mark.pre_merge,
-        ],
-        model="Qwen/Qwen3-VL-2B-Instruct",
-        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
-        timeout=300,
-        request_payloads=[
-            chat_payload(
-                [
-                    {
-                        "type": "text",
-                        "text": "What colors are in the following image? Respond only with the colors.",
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": MULTIMODAL_IMG_URL},
-                    },
-                ],
-                repeat_count=1,
-                expected_response=["green"],
-                temperature=0.0,
-                max_tokens=100,
-            )
-        ],
-    ),
-    # P/D multimodal (no encoder): prefill loads images via PIL,
-    # computes grid_thw for decode using smart_resize.
-    "multimodal_p_d_qwen": VLLMConfig(
-        name="multimodal_p_d_qwen",
-        directory=vllm_dir,
-        script_name="disagg_multimodal_p_d.sh",
-        marks=[
-            pytest.mark.gpu_1,
-            pytest.mark.pre_merge,
-        ],
-        model="Qwen/Qwen3-VL-2B-Instruct",
-        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
-        timeout=300,
-        request_payloads=[
-            chat_payload(
-                [
-                    {
-                        "type": "text",
-                        "text": "What colors are in the following image? Respond only with the colors.",
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": MULTIMODAL_IMG_URL},
-                    },
-                ],
-                repeat_count=1,
-                expected_response=["green"],
-                temperature=0.0,
-                max_tokens=100,
-            )
-        ],
-    ),
-    "multimodal_agg_qwen": VLLMConfig(
-        name="multimodal_agg_qwen",
-        directory=vllm_dir,
-        script_name="agg_multimodal.sh",
-        marks=[
-            pytest.mark.gpu_1,
-            pytest.mark.profiled_vram_gib(19.9),  # actual profiled peak with kv-bytes
-            pytest.mark.requested_vllm_kv_cache_bytes(
-                922_354_000
-            ),  # KV cache cap (2x safety over min=461_176_832)
-            pytest.mark.timeout(
-                360
-            ),  # ~7x observed 50.0s; 7B model loads ~48s on CI (A10G/L4)
-            pytest.mark.post_merge,
-        ],
-        model="Qwen/Qwen2.5-VL-7B-Instruct",
-        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
-        delayed_start=0,
-        timeout=360,
-        request_payloads=[
-            chat_payload(
-                [
-                    {
-                        "type": "text",
-                        "text": "What colors are in the following image? Respond only with the colors.",
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": MULTIMODAL_IMG_URL},
-                    },
-                ],
-                repeat_count=1,
-                expected_response=["purple"],
-                max_tokens=100,
-            ),
-        ],
-    ),
    "multimodal_agg_llava": VLLMConfig(
        name="multimodal_agg_llava",
        directory=vllm_dir,
@@ -560,125 +418,6 @@ vllm_configs = {
            ),
        ],
    ),
-    # Video multimodal tests for CI use the canonical aggregated multimodal launcher.
-    "multimodal_video_agg": VLLMConfig(
-        name="multimodal_video_agg",
-        directory=vllm_dir,
-        script_name="agg_multimodal.sh",
-        marks=[
-            pytest.mark.gpu_1,
-            pytest.mark.pre_merge,
-        ],  # TODO: profile to get max_vram and timeout
-        model="Qwen/Qwen3-VL-2B-Instruct",
-        delayed_start=60,  # Video models require longer loading time
-        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct"],
-        timeout=600,  # 10 minutes for video processing overhead
-        request_payloads=[
-            chat_payload(
-                [
-                    {"type": "text", "text": "Describe the video in detail"},
-                    {
-                        "type": "video_url",
-                        "video_url": {"url": LOCAL_VIDEO_TEST_URI},
-                    },
-                ],
-                repeat_count=1,
-                expected_response=["red", "static", "still"],
-                temperature=0.0,
-                max_tokens=100,
-            )
-        ],
-    ),
-    "multimodal_video_disagg": VLLMConfig(
-        name="multimodal_video_disagg",
-        directory=vllm_dir,
-        script_name="disagg_multimodal_epd.sh",
-        marks=[
-            pytest.mark.gpu_1,
-            pytest.mark.pre_merge,
-        ],  # TODO: profile to get max_vram and timeout
-        model="Qwen/Qwen3-VL-2B-Instruct",
-        delayed_start=60,  # Video models require longer loading time
-        script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
-        timeout=600,  # 10 minutes for video processing overhead
-        request_payloads=[
-            chat_payload(
-                [
-                    {"type": "text", "text": "Describe the video in detail"},
-                    {
-                        "type": "video_url",
-                        "video_url": {"url": LOCAL_VIDEO_TEST_URI},
-                    },
-                ],
-                repeat_count=1,
-                expected_response=["red", "static", "still"],
-                temperature=0.0,
-                max_tokens=100,
-            )
-        ],
-    ),
-    # Audio multimodal tests for nightly CI pipeline
-    # These tests validate audio inference capabilities with Qwen2-Audio model
-    "multimodal_audio_agg": VLLMConfig(
-        name="multimodal_audio_agg",
-        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
-        script_name="audio_agg.sh",
-        marks=[
-            pytest.mark.gpu_2,  # encode worker loads Qwen2Audio on GPU (~19 GiB)
-            pytest.mark.nightly,
-            pytest.mark.timeout(600),
-        ],
-        model="Qwen/Qwen2-Audio-7B-Instruct",
-        delayed_start=0,
-        script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
-        request_payloads=[
-            chat_payload(
-                [
-                    {"type": "text", "text": "What is recited in the audio?"},
-                    {
-                        "type": "audio_url",
-                        "audio_url": {
-                            "url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
-                        },
-                    },
-                ],
-                repeat_count=1,
-                expected_response=["Hester", "Pynne"],
-                temperature=0.0,
-                max_tokens=100,
-            )
-        ],
-    ),
-    "multimodal_audio_disagg": VLLMConfig(
-        name="multimodal_audio_disagg",
-        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
-        script_name="audio_disagg.sh",
-        marks=[
-            pytest.mark.gpu_4,  # needs 3 GPUs (encode loads Qwen2Audio ~19 GiB + prefill + decode)
-            pytest.mark.nightly,
-            pytest.mark.timeout(600),
-        ],
-        model="Qwen/Qwen2-Audio-7B-Instruct",
-        delayed_start=0,
-        script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
-        request_payloads=[
-            chat_payload(
-                [
-                    {"type": "text", "text": "What is recited in the audio?"},
-                    {
-                        "type": "audio_url",
-                        "audio_url": {
-                            "url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
-                        },
-                    },
-                ],
-                repeat_count=1,
-                expected_response=["Hester", "Pynne"],
-                temperature=0.0,
-                max_tokens=100,
-            )
-        ],
-    ),
    "aggregated_toolcalling": VLLMConfig(
        name="aggregated_toolcalling",
        directory=vllm_dir,

--- a/tests/utils/multimodal.py
+++ b/tests/utils/multimodal.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Optional, Type
+
+import pytest
+
+from dynamo.common.utils.paths import WORKSPACE_DIR
+from tests.serve.conftest import MULTIMODAL_IMG_URL
+from tests.utils.engine_process import EngineConfig
+from tests.utils.payload_builder import chat_payload
+from tests.utils.payloads import BasePayload, ChatPayload
+
+# Test video inside the workspace, resolved to an absolute path and exposed as a
+# URI for use in video payloads.
+LOCAL_VIDEO_TEST_PATH = Path(
+    WORKSPACE_DIR, "lib/llm/tests/data/media/240p_10.mp4"
+).resolve()
+LOCAL_VIDEO_TEST_URI = LOCAL_VIDEO_TEST_PATH.as_uri()
+
+# Remote WAV sample referenced by the audio payload.
+AUDIO_TEST_URL = (
+    "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client"
+    "/main/datasets/mini_en/wav/1221-135766-0002.wav"
+)
+
+
+# ---------------------------------------------------------------------------
+# Payload factories
+# ---------------------------------------------------------------------------
+
+
+def make_image_payload(expected_response: list[str]) -> ChatPayload:
+    """Standard image color-identification payload using MULTIMODAL_IMG_URL."""
+    return chat_payload(
+        [
+            {
+                "type": "text",
+                "text": "What colors are in the following image? "
+                "Respond only with the colors.",
+            },
+            {
+                "type": "image_url",
+                "image_url": {"url": MULTIMODAL_IMG_URL},
+            },
+        ],
+        repeat_count=1,
+        expected_response=expected_response,
+        temperature=0.0,
+        max_tokens=100,
+    )
+
+
+def make_video_payload(expected_response: list[str]) -> ChatPayload:
+    """Standard video description payload using the local test video."""
+    return chat_payload(
+        [
+            {"type": "text", "text": "Describe the video in detail"},
+            {
+                "type": "video_url",
+                "video_url": {"url": LOCAL_VIDEO_TEST_URI},
+            },
+        ],
+        repeat_count=1,
+        expected_response=expected_response,
+        temperature=0.0,
+        max_tokens=100,
+    )
+
+
+def make_audio_payload(expected_response: list[str]) -> ChatPayload:
+    """Standard audio transcription payload using the remote test WAV."""
+    return chat_payload(
+        [
+            {"type": "text", "text": "What is recited in the audio?"},
+            {
+                "type": "audio_url",
+                "audio_url": {"url": AUDIO_TEST_URL},
+            },
+        ],
+        repeat_count=1,
+        expected_response=expected_response,
+        temperature=0.0,
+        max_tokens=100,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Config dataclasses
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class TopologyConfig:
+    """Per-topology overrides for marks, timeout, and VRAM profiling.
+
+    Consumed by :func:`make_multimodal_configs`, which turns each field into
+    pytest marks or constructor arguments for one generated config.
+    """
+
+    marks: list[Any] = field(default_factory=list)  # extra pytest marks for this topology
+    timeout_s: int = 300  # emitted as pytest.mark.timeout(timeout_s)
+    profiled_vram_gib: Optional[float] = None  # emitted as a profiled_vram_gib mark when not None
+    requested_vllm_kv_cache_bytes: Optional[int] = None  # emitted as a mark when not None
+    delayed_start: int = 0  # forwarded as the config's delayed_start
+    directory: Optional[str] = None  # override the default directory given to make_multimodal_configs
+    gpu_marker: Optional[str] = None  # override profile-level gpu_marker
+    single_gpu: bool = False  # append --single-gpu to script_args
+
+
+@dataclass
+class MultimodalModelProfile:
+    """Describes a multimodal model's test-relevant properties.
+
+    Each profile generates one config per topology in ``topologies``
+    via :func:`make_multimodal_configs`. Generated configs are keyed
+    ``mm_<topology>_<short_name>``.
+    """
+
+    name: str  # HuggingFace model ID
+    short_name: str  # kebab-case slug for config key
+    topologies: dict[str, TopologyConfig]  # topology key -> per-topology settings
+    request_payloads: list[BasePayload]  # payloads attached to every generated config
+    gpu_marker: str = "gpu_1"  # pytest.mark attribute name; a topology may override it
+    extra_vllm_args: list[str] = field(default_factory=list)  # appended after "--model <name>"
+    marks: list[Any] = field(default_factory=list)  # shared across all topologies
+    gated: bool = False  # if True, skip unless DYN_HF_GATED_MODELS_ENABLED=1
+
+
+# ---------------------------------------------------------------------------
+# Config generator
+# ---------------------------------------------------------------------------
+
+
+def make_multimodal_configs(
+    profile: MultimodalModelProfile,
+    config_cls: Type[EngineConfig],
+    directory: str,
+    topology_scripts: dict[str, str],
+) -> dict[str, EngineConfig]:
+    """Generate config entries for each topology in *profile*.
+
+    Parameters
+    ----------
+    config_cls:
+        The concrete config class to instantiate (e.g. ``VLLMConfig``).
+    directory:
+        Default directory; overridden by ``TopologyConfig.directory`` if set.
+    topology_scripts:
+        Mapping from topology key to shell script filename.
+    """
+    configs: dict[str, EngineConfig] = {}
+    for topology, topo_cfg in profile.topologies.items():
+        script_name = topology_scripts[topology]
+        script_args = ["--model", profile.name] + profile.extra_vllm_args
+        if topo_cfg.single_gpu:
+            script_args.append("--single-gpu")
+
+        gpu = topo_cfg.gpu_marker or profile.gpu_marker
+        marks: list[Any] = [
+            getattr(pytest.mark, gpu),
+            pytest.mark.timeout(topo_cfg.timeout_s),
+        ]
+        marks.extend(topo_cfg.marks)
+        if topo_cfg.profiled_vram_gib is not None:
+            marks.append(pytest.mark.profiled_vram_gib(topo_cfg.profiled_vram_gib))
+        if topo_cfg.requested_vllm_kv_cache_bytes is not None:
+            marks.append(
+                pytest.mark.requested_vllm_kv_cache_bytes(
+                    topo_cfg.requested_vllm_kv_cache_bytes
+                )
+            )
+        if profile.gated:
+            marks.append(
+                pytest.mark.skipif(
+                    not os.environ.get("DYN_HF_GATED_MODELS_ENABLED"),
+                    reason=(
+                        f"{profile.name} is gated; set DYN_HF_GATED_MODELS_ENABLED=1 "
+                        "with an HF_TOKEN that has accepted the license"
+                    ),
+                )
+            )
+        marks.extend(profile.marks)
+
+        key = f"mm_{topology}_{profile.short_name}"
+        configs[key] = config_cls(
+            name=key,
+            directory=topo_cfg.directory or directory,
+            script_name=script_name,
+            model=profile.name,
+            script_args=script_args,
+            marks=marks,
+            delayed_start=topo_cfg.delayed_start,
+            request_payloads=profile.request_payloads,
+        )
+    return configs