chore: Add vllm multimodal tests to pre_merge (#4889)

fb1b4f92 · Kris Hung · GitHub · e83847c0 · fb1b4f92 · fb1b4f92
Unverified commit fb1b4f92, authored Dec 18, 2025 by Kris Hung; committed by GitHub Dec 17, 2025.
6 changed files
--- a/.github/workflows/container-validation-backends.yml
+++ b/.github/workflows/container-validation-backends.yml
@@ -143,6 +143,8 @@ jobs:
          echo ${K8S_NODE_NAME}
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
+        with:
+          lfs: true
      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:

--- a/components/src/dynamo/vllm/multimodal_utils/encode_utils.py
+++ b/components/src/dynamo/vllm/multimodal_utils/encode_utils.py
@@ -18,7 +18,7 @@ from typing import Any, Dict, Optional

 import torch

-from .model import SupportedModels, is_model_supported
+from .model import SupportedModels, is_model_supported, is_qwen_vl_model

 logger = logging.getLogger(__name__)

@@ -88,7 +88,7 @@ def encode_image_embeddings(

            embeddings = projector(vision_outputs.last_hidden_state)

-        elif is_model_supported(model_name, SupportedModels.QWEN_2_5_VL_7B):
+        elif is_qwen_vl_model(model_name):
            embeddings = get_qwen_image_features(vision_encoder, image_embeds)

        else:
@@ -123,7 +123,7 @@ def get_encoder_components(
        projector = getattr(vision_model, "multi_modal_projector", None)
        return vision_encoder, projector

-    elif is_model_supported(model_name, SupportedModels.QWEN_2_5_VL_7B):
+    elif is_qwen_vl_model(model_name):
        vision_encoder = vision_model
        projector = None
        return vision_encoder, projector

--- a/components/src/dynamo/vllm/multimodal_utils/model.py
+++ b/components/src/dynamo/vllm/multimodal_utils/model.py
@@ -27,6 +27,7 @@ class SupportedModels:
    """Supported multimodal model identifiers"""

    LLAVA_1_5_7B = "llava-hf/llava-1.5-7b-hf"
+    QWEN_2_VL_2B = "Qwen/Qwen2-VL-2B-Instruct"
    QWEN_2_5_VL_7B = "Qwen/Qwen2.5-VL-7B-Instruct"
    LLAVA_NEXT_VIDEO_7B = "llava-hf/LLaVA-NeXT-Video-7B-hf"

@@ -100,6 +101,28 @@ def is_model_supported(model_name: str, supported_model: str) -> bool:
    return normalized_name == normalized_supported


+# Supported Qwen VL model variants; add new entries here to extend support
+QWEN_VL_MODELS = [
+    SupportedModels.QWEN_2_VL_2B,
+    SupportedModels.QWEN_2_5_VL_7B,
+]
+
+
+def is_qwen_vl_model(model_name: str) -> bool:
+    """
+    Check if a model is any Qwen VL variant.
+
+    Args:
+        model_name: The model name to check
+
+    Returns:
+        True if the model is a Qwen VL variant, False otherwise
+    """
+    return any(
+        is_model_supported(model_name, qwen_model) for qwen_model in QWEN_VL_MODELS
+    )
+
+
 def load_vision_model(model_id: str) -> torch.nn.Module:
    """
    Load a vision model from a HuggingFace model ID.
@@ -132,7 +155,7 @@ def construct_mm_data(
    image_embeds = image_embeds.to(embeddings_dtype)

    # Model-specific image handling
-    if is_model_supported(model, SupportedModels.QWEN_2_5_VL_7B):
+    if is_qwen_vl_model(model):
        return _construct_qwen_image_data(image_embeds, image_grid_thw)
    else:
        # Default image handling for other models (e.g., LLAVA_1_5_7B)

--- a/examples/backends/vllm/launch/agg_multimodal_epd.sh
+++ b/examples/backends/vllm/launch/agg_multimodal_epd.sh
@@ -19,6 +19,7 @@ trap 'echo Cleaning up...; kill 0' EXIT
 MODEL_NAME="llava-hf/llava-1.5-7b-hf"
 PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
 PROVIDED_PROMPT_TEMPLATE=""
+SINGLE_GPU=false

 # Parse command line arguments
 while [[ $# -gt 0 ]]; do
@@ -31,11 +32,16 @@ while [[ $# -gt 0 ]]; do
            PROVIDED_PROMPT_TEMPLATE=$2
            shift 2
            ;;
+        --single-gpu)
+            SINGLE_GPU=true
+            shift
+            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo "  --model <model_name> Specify the model to use (default: $MODEL_NAME)"
            echo "  --prompt-template <template> Specify the multi-modal prompt template to use. LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates."
+            echo "  --single-gpu         Run both encode and PD workers on GPU 0 (for pre-merge CI)"
            echo "  -h, --help           Show this help message"
            exit 0
            ;;
@@ -54,7 +60,7 @@ elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
    PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
 elif [[ "$MODEL_NAME" == "microsoft/Phi-3.5-vision-instruct" ]]; then
    PROMPT_TEMPLATE="<|user|>\n<|image_1|>\n<prompt><|end|>\n<|assistant|>\n"
-elif [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
+elif [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]] || [[ "$MODEL_NAME" == "Qwen/Qwen2-VL-2B-Instruct" ]]; then
    PROMPT_TEMPLATE="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><prompt><|im_end|>\n<|im_start|>assistant\n"
 else
    echo "No multi-modal prompt template is defined for the model: $MODEL_NAME"
@@ -67,11 +73,14 @@ fi
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &

-# To make Qwen2.5-VL fit in A100 40GB, set the following extra arguments
+# Set GPU memory utilization and model length based on deployment mode
+# Single-GPU mode: Both workers share GPU 0, so use reduced memory settings and --enforce-eager
+# Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings
 EXTRA_ARGS=""
-if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
-    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
-elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
+if [[ "$SINGLE_GPU" == "true" ]]; then
+    EXTRA_ARGS="--gpu-memory-utilization 0.3 --max-model-len 3072 --enforce-eager"
+else
+    # Multi-GPU mode: standard memory settings
    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
 fi

@@ -79,8 +88,15 @@ fi
 python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &

 # run E/P/D workers
-CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME &
-CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
+# Use single GPU (GPU 0) for pre-merge CI, otherwise use GPU 0 for encode and GPU 1 for PD
+if [[ "$SINGLE_GPU" == "true" ]]; then
+    # Single GPU mode: both workers share GPU 0 with reduced memory
+    CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
+    CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
+else
+    CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
+    CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &
+fi

 # Wait for all background processes to complete
 wait
--- a/examples/backends/vllm/launch/disagg_multimodal_epd.sh
+++ b/examples/backends/vllm/launch/disagg_multimodal_epd.sh
@@ -79,7 +79,6 @@ python -m dynamo.frontend &
 echo "Starting processor..."
 python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &

-# Configure GPU memory optimization for specific models
 EXTRA_ARGS=""

 # Start encode worker

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -227,6 +227,32 @@ vllm_configs = {
            completion_payload_default(),
        ],
    ),
+    "multimodal_agg_qwen2vl_2b_epd": VLLMConfig(
+        name="multimodal_agg_qwen2vl_2b_epd",
+        directory=vllm_dir,
+        script_name="agg_multimodal_epd.sh",
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        model="Qwen/Qwen2-VL-2B-Instruct",
+        script_args=["--model", "Qwen/Qwen2-VL-2B-Instruct", "--single-gpu"],
+        request_payloads=[
+            chat_payload(
+                [
+                    {
+                        "type": "text",
+                        "text": "What colors are in the following image? Respond only with the colors.",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": MULTIMODAL_IMG_URL},
+                    },
+                ],
+                repeat_count=1,
+                expected_response=["purple"],
+                temperature=0.0,
+                max_tokens=100,
+            )
+        ],
+    ),
    "multimodal_agg_llava_epd": VLLMConfig(
        name="multimodal_agg_llava_epd",
        directory=vllm_dir,
@@ -284,7 +310,7 @@ vllm_configs = {
        name="multimodal_agg_qwen",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
-        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        delayed_start=0,
@@ -312,7 +338,7 @@ vllm_configs = {
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[
-            pytest.mark.gpu_2,
+            pytest.mark.gpu_1,
            # https://github.com/ai-dynamo/dynamo/issues/4501
            pytest.mark.xfail(strict=False),
        ],
@@ -374,7 +400,7 @@ vllm_configs = {
        name="multimodal_audio_agg",
        directory="/workspace/examples/multimodal",
        script_name="audio_agg.sh",
-        marks=[pytest.mark.gpu_2],
+        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="Qwen/Qwen2-Audio-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],