ci: Add vllm multimodal example to pytest (#2451)

32521261 · Kris Hung · GitHub · b5cf1ad3 · 32521261
Unverified Commit 32521261 authored Aug 15, 2025 by Kris Hung Committed by GitHub Aug 15, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 87 additions and 23 deletions

tests/serve/test_vllm.py tests/serve/test_vllm.py +87 -23

No files found.
--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -5,7 +5,7 @@ import logging
 import os
 import time
 from dataclasses import dataclass
-from typing import Any, Callable, List
+from typing import Any, Callable, List, Optional
 import pytest
 import requests
@@ -24,28 +24,55 @@ text_prompt = "Tell me a short joke about AI."
 def create_payload_for_config(config: "VLLMConfig") -> Payload:
    """Create a payload using the model from the vLLM config"""
-    return Payload(
+    if "multimodal" in config.name:
-        payload_chat={
+        return Payload(
-            "model": config.model,
+            payload_chat={
-            "messages": [
+                "model": config.model,
-                {
+                "messages": [
-                    "role": "user",
+                    {
-                    "content": text_prompt,
+                        "role": "user",
-                }
+                        "content": [
-            ],
+                            {"type": "text", "text": "What is in this image?"},
-            "max_tokens": 150,
+                            {
-            "temperature": 0.1,
+                                "type": "image_url",
-        },
+                                "image_url": {
-        payload_completions={
+                                    "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
-            "model": config.model,
+                                },
-            "prompt": text_prompt,
+                            },
-            "max_tokens": 150,
+                        ],
-            "temperature": 0.1,
+                    }
-        },
+                ],
-        repeat_count=1,
+                "max_tokens": 300,
-        expected_log=[],
+                "temperature": 0.0,
-        expected_response=["AI"],
+                "stream": False,
-    )
+            },
+            repeat_count=1,
+            expected_log=[],
+            expected_response=["bus"],
+        )
+    else:
+        return Payload(
+            payload_chat={
+                "model": config.model,
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": text_prompt,
+                    }
+                ],
+                "max_tokens": 150,
+                "temperature": 0.1,
+            },
+            payload_completions={
+                "model": config.model,
+                "prompt": text_prompt,
+                "max_tokens": 150,
+                "temperature": 0.1,
+            },
+            repeat_count=1,
+            expected_log=[],
+            expected_response=["AI"],
+        )
 @dataclass
@@ -61,6 +88,7 @@ class VLLMConfig:
    model: str
    timeout: int = 120
    delayed_start: int = 0
+    args: Optional[List[str]] = None
 class VLLMProcess(ManagedProcess):
@@ -76,6 +104,8 @@ class VLLMProcess(ManagedProcess):
            raise FileNotFoundError(f"vLLM script not found: {script_path}")
        command = ["bash", script_path]
+        if config.args:
+            command.extend(config.args)
        super().__init__(
            command=command,
@@ -148,6 +178,13 @@ class VLLMProcess(ManagedProcess):
                    logger.warning("Retrying due to no instances available")
                    time.sleep(retry_delay)
                    continue
+                elif (
+                    "multimodal" in self.config.name
+                    and "Failed to fold chat completions stream" in error
+                ):
+                    logger.warning("Retrying due to endpoint not ready for multimodal")
+                    time.sleep(retry_delay)
+                    continue
            if response.status_code == 404:
                error = response.json().get("error", "")
                if "Model not found" in error:
@@ -223,6 +260,33 @@ vllm_configs = {
        model="Qwen/Qwen3-0.6B",
        delayed_start=45,
    ),
+    "multimodal_agg": VLLMConfig(
+        name="multimodal_agg",
+        directory="/workspace/examples/multimodal_v1",
+        script_name="agg.sh",
+        marks=[pytest.mark.gpu_2, pytest.mark.vllm],
+        endpoints=["v1/chat/completions"],
+        response_handlers=[
+            chat_completions_response_handler,
+        ],
+        model="llava-hf/llava-1.5-7b-hf",
+        delayed_start=45,
+        args=["--model", "llava-hf/llava-1.5-7b-hf"],
+    ),
+    # TODO: Enable this test case when we have 4-GPU runners.
+    # "multimodal_disagg": VLLMConfig(
+    #     name="multimodal_disagg",
+    #     directory="/workspace/examples/multimodal_v1",
+    #     script_name="disagg.sh",
+    #     marks=[pytest.mark.gpu_4, pytest.mark.vllm],
+    #     endpoints=["v1/chat/completions"],
+    #     response_handlers=[
+    #         chat_completions_response_handler,
+    #     ],
+    #     model="llava-hf/llava-1.5-7b-hf",
+    #     delayed_start=45,
+    #     args=["--model", "llava-hf/llava-1.5-7b-hf"],
+    # ),
 }