Unverified Commit 32521261 authored by Kris Hung's avatar Kris Hung Committed by GitHub
Browse files

ci: Add vllm multimodal example to pytest (#2451)

parent b5cf1ad3
...@@ -5,7 +5,7 @@ import logging ...@@ -5,7 +5,7 @@ import logging
import os import os
import time import time
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Callable, List from typing import Any, Callable, List, Optional
import pytest import pytest
import requests import requests
...@@ -24,28 +24,55 @@ text_prompt = "Tell me a short joke about AI." ...@@ -24,28 +24,55 @@ text_prompt = "Tell me a short joke about AI."
def create_payload_for_config(config: "VLLMConfig") -> Payload: def create_payload_for_config(config: "VLLMConfig") -> Payload:
"""Create a payload using the model from the vLLM config""" """Create a payload using the model from the vLLM config"""
return Payload( if "multimodal" in config.name:
payload_chat={ return Payload(
"model": config.model, payload_chat={
"messages": [ "model": config.model,
{ "messages": [
"role": "user", {
"content": text_prompt, "role": "user",
} "content": [
], {"type": "text", "text": "What is in this image?"},
"max_tokens": 150, {
"temperature": 0.1, "type": "image_url",
}, "image_url": {
payload_completions={ "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
"model": config.model, },
"prompt": text_prompt, },
"max_tokens": 150, ],
"temperature": 0.1, }
}, ],
repeat_count=1, "max_tokens": 300,
expected_log=[], "temperature": 0.0,
expected_response=["AI"], "stream": False,
) },
repeat_count=1,
expected_log=[],
expected_response=["bus"],
)
else:
return Payload(
payload_chat={
"model": config.model,
"messages": [
{
"role": "user",
"content": text_prompt,
}
],
"max_tokens": 150,
"temperature": 0.1,
},
payload_completions={
"model": config.model,
"prompt": text_prompt,
"max_tokens": 150,
"temperature": 0.1,
},
repeat_count=1,
expected_log=[],
expected_response=["AI"],
)
@dataclass @dataclass
...@@ -61,6 +88,7 @@ class VLLMConfig: ...@@ -61,6 +88,7 @@ class VLLMConfig:
model: str model: str
timeout: int = 120 timeout: int = 120
delayed_start: int = 0 delayed_start: int = 0
args: Optional[List[str]] = None
class VLLMProcess(ManagedProcess): class VLLMProcess(ManagedProcess):
...@@ -76,6 +104,8 @@ class VLLMProcess(ManagedProcess): ...@@ -76,6 +104,8 @@ class VLLMProcess(ManagedProcess):
raise FileNotFoundError(f"vLLM script not found: {script_path}") raise FileNotFoundError(f"vLLM script not found: {script_path}")
command = ["bash", script_path] command = ["bash", script_path]
if config.args:
command.extend(config.args)
super().__init__( super().__init__(
command=command, command=command,
...@@ -148,6 +178,13 @@ class VLLMProcess(ManagedProcess): ...@@ -148,6 +178,13 @@ class VLLMProcess(ManagedProcess):
logger.warning("Retrying due to no instances available") logger.warning("Retrying due to no instances available")
time.sleep(retry_delay) time.sleep(retry_delay)
continue continue
elif (
"multimodal" in self.config.name
and "Failed to fold chat completions stream" in error
):
logger.warning("Retrying due to endpoint not ready for multimodal")
time.sleep(retry_delay)
continue
if response.status_code == 404: if response.status_code == 404:
error = response.json().get("error", "") error = response.json().get("error", "")
if "Model not found" in error: if "Model not found" in error:
...@@ -223,6 +260,33 @@ vllm_configs = { ...@@ -223,6 +260,33 @@ vllm_configs = {
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
delayed_start=45, delayed_start=45,
), ),
"multimodal_agg": VLLMConfig(
name="multimodal_agg",
directory="/workspace/examples/multimodal_v1",
script_name="agg.sh",
marks=[pytest.mark.gpu_2, pytest.mark.vllm],
endpoints=["v1/chat/completions"],
response_handlers=[
chat_completions_response_handler,
],
model="llava-hf/llava-1.5-7b-hf",
delayed_start=45,
args=["--model", "llava-hf/llava-1.5-7b-hf"],
),
# TODO: Enable this test case when we have 4 GPUs runners.
# "multimodal_disagg": VLLMConfig(
# name="multimodal_disagg",
# directory="/workspace/examples/multimodal_v1",
# script_name="disagg.sh",
# marks=[pytest.mark.gpu_4, pytest.mark.vllm],
# endpoints=["v1/chat/completions"],
# response_handlers=[
# chat_completions_response_handler,
# ],
# model="llava-hf/llava-1.5-7b-hf",
# delayed_start=45,
# args=["--model", "llava-hf/llava-1.5-7b-hf"],
# ),
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment