Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
......@@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
decode_pooling_output,
)
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small"
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
DTYPE = "bfloat16"
if current_platform.is_rocm():
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
@pytest.fixture(scope="module")
def server():
args = [
......@@ -53,6 +57,10 @@ def server():
DUMMY_CHAT_TEMPLATE,
]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
......
......@@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODELS = [
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
EmbedModelInfo(
......@@ -62,6 +57,10 @@ def server(model_info, dtype: str):
["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
)
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(model_info.name, args) as remote_server:
yield remote_server
......
......@@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
def _generate_random_text(word_count: int) -> str:
"""Generate random text with approximately the specified word count."""
......@@ -221,13 +216,17 @@ def server_with_chunked_processing():
"512", # Set smaller max_model_len to trigger chunking mechanism
"--pooler-config",
(
'{"pooling_type": "MEAN", "normalize": true, '
'{"pooling_type": "MEAN", "use_activation": true, '
'"enable_chunked_processing": true, "max_embed_len": 10000}'
),
"--gpu-memory-utilization",
"0.8",
]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
......
......@@ -11,7 +11,7 @@ from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
from vllm.multimodal.utils import fetch_image
from ...utils import models_path_prefix, urls_port
......@@ -55,14 +55,6 @@ def server():
yield remote_server
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
for image_url in TEST_IMAGE_ASSETS
}
def get_hf_prompt_tokens(model_name, content, image_url):
processor = AutoProcessor.from_pretrained(
model_name, trust_remote_code=True, num_crops=4
......
......@@ -4,7 +4,7 @@ import os
import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import (
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
MTEB_RERANK_LANGS,
MTEB_RERANK_TASKS,
MTEB_RERANK_TOL,
......@@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
......@@ -30,6 +25,10 @@ st_main_score = 0.33457
def server():
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
......
......@@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
......@@ -30,6 +31,7 @@ def llm():
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)
......
......@@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16"
......@@ -24,6 +19,10 @@ DTYPE = "bfloat16"
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
......
......@@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODELS = [
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
{"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
......@@ -44,6 +39,10 @@ def model(request):
def server(model: dict[str, Any]):
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(model["name"], args) as remote_server:
yield remote_server
......@@ -237,17 +236,14 @@ class TestModel:
"use_activation": use_activation,
},
)
if response.status_code != 200:
return response
outputs = response.json()
return torch.tensor([x["score"] for x in outputs["data"]])
if model["is_cross_encoder"]:
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
default = get_outputs(use_activation=None)
w_activation = get_outputs(use_activation=True)
wo_activation = get_outputs(use_activation=False)
if model["is_cross_encoder"]:
assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation."
)
......@@ -257,9 +253,3 @@ class TestModel:
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)."
)
else:
get_outputs(use_activation=None)
# The activation parameter only works for the is_cross_encoder model
response = get_outputs(use_activation=True)
assert response.status_code == 400
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
from vllm.entrypoints.score_utils import get_score_prompt
from vllm.inputs import TokensPrompt
from vllm.tokenizers import get_tokenizer
# A cross-encoder model for testing
CROSS_ENCODER_MODEL_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2"
def assert_prompt_tokenization_consistent(
    tokenizer, full_prompt, engine_prompt, add_special_tokens=True
):
    """Assert that re-tokenizing ``full_prompt`` reproduces the engine prompt IDs.

    The tokenizer is invoked as ``tokenizer(full_prompt, add_special_tokens=...)``
    and the ``"input_ids"`` entry of its result is compared for equality with
    ``engine_prompt["prompt_token_ids"]``. An ``AssertionError`` listing both
    sequences is raised on mismatch.
    """
    encoding = tokenizer(full_prompt, add_special_tokens=add_special_tokens)
    expected = encoding["input_ids"]
    actual = engine_prompt["prompt_token_ids"]
    assert actual == expected, (
        f"Token IDs don't match.\nExpected: {expected}\nActual: {actual}"
    )
@pytest.fixture(scope="module")
def cross_encoder_model_config():
    """Module-scoped ``ModelConfig`` for the cross-encoder using the pooling runner."""
    config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
    return config
@pytest.fixture(scope="module")
def cross_encoder_tokenizer(cross_encoder_model_config):
    """Module-scoped tokenizer for the cross-encoder model.

    ``trust_remote_code`` is taken from the shared model config fixture.
    """
    trust_remote_code = cross_encoder_model_config.trust_remote_code
    return get_tokenizer(CROSS_ENCODER_MODEL_ID, trust_remote_code=trust_remote_code)
@pytest.fixture(scope="module")
def llm_reranker_model_config():
    """Model config emulating an LLM-as-reranker (``use_sep_token`` disabled).

    Built from the same cross-encoder checkpoint as the regular config, but
    with ``hf_config.use_sep_token`` forced to ``False``.
    """
    reranker_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
    # use_sep_token is exposed as a property backed by hf_config, so the
    # override has to be written to hf_config rather than to the config itself.
    reranker_config.hf_config.use_sep_token = False
    return reranker_config
@pytest.fixture
def tokenization_kwargs():
    """Return a fresh dict of the tokenization kwargs shared by most tests."""
    return dict(add_special_tokens=True, return_tensors=None)
@pytest.fixture
def mock_model_with_score_template():
    """Provide a stand-in model class that advertises score-template support.

    The class is created anew for every fixture request, so the
    ``post_process_called`` list starts out empty for each test.
    """

    class MockModelWithScoreTemplate:
        supports_score_template = True
        # Every prompt handed to post_process_tokens is recorded here.
        post_process_called: list[TokensPrompt] = []

        @staticmethod
        def post_process_tokens(prompt: TokensPrompt) -> None:
            MockModelWithScoreTemplate.post_process_called.append(prompt)

        @staticmethod
        def get_score_template(p1: str, p2: str) -> str:
            return f"[QUERY]{p1}[SEP][DOC]{p2}"

    return MockModelWithScoreTemplate
@pytest.fixture
def mock_model_no_score_template():
    """Provide a stand-in model class with ``supports_score_template = False``."""

    class MockModelNoScoreTemplate:
        # The only attribute this mock defines.
        supports_score_template = False

    return MockModelNoScoreTemplate
class TestGetScorePrompt:
    """Tests for the get_score_prompt function."""

    # Dotted paths of the collaborators that individual tests replace.
    _GET_MODEL_CLS = "vllm.model_executor.model_loader.get_model_cls"
    _APPLY_CHAT_TEMPLATE = "vllm.entrypoints.score_utils.apply_hf_chat_template"

    def test_tokenization_kwargs_passed_through(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
    ):
        """Truncation-related tokenization kwargs must reach the tokenizer."""
        max_length = 20
        truncating_kwargs = {
            "add_special_tokens": True,
            "return_tensors": None,
            "truncation": True,
            "max_length": max_length,
        }

        full_prompt, engine_prompt = get_score_prompt(
            llm_reranker_model_config,
            cross_encoder_tokenizer,
            truncating_kwargs,
            "Query text",
            "Document text",
        )

        assert isinstance(full_prompt, str)
        assert "prompt_token_ids" in engine_prompt
        truncated_ids = engine_prompt["prompt_token_ids"]
        # Truncation with max_length=20 caps the number of produced tokens.
        assert len(truncated_ids) <= max_length

        # The truncated IDs must be a prefix of the untruncated encoding.
        untruncated = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)
        untruncated_ids = untruncated["input_ids"]
        assert untruncated_ids[: len(truncated_ids)] == truncated_ids, (
            f"Token IDs are not a prefix of full encoding.\n"
            f"Full IDs: {untruncated_ids}\n"
            f"Actual IDs: {truncated_ids}"
        )

    def test_model_supports_score_template(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """The model's own score template is used when no template is passed."""
        with patch(self._GET_MODEL_CLS, return_value=mock_model_with_score_template):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query text",
                "document text",
            )

        assert full_prompt == "[QUERY]query text[SEP][DOC]document text"
        assert "prompt_token_ids" in engine_prompt
        assert len(engine_prompt["prompt_token_ids"]) > 0
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_model_supports_score_template_but_custom_template_provided(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """An explicitly passed template wins over the model's score template."""
        custom_template = (
            'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}'
        )

        with patch(self._GET_MODEL_CLS, return_value=mock_model_with_score_template):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
                score_template=custom_template,
            )

        assert "prompt_token_ids" in engine_prompt
        assert full_prompt == "TEMPLATE_USED query doc"
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_not_using_default_template(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Without an explicit template the two texts are simply concatenated."""
        # FIXME: For now, a template is only applied when one is explicitly
        # provided. The tokenizer's chat template cannot be relied upon because
        # many models inherit junk templates from their base LLM, which breaks
        # both the models and the tests that use them.
        model_cls_patch = patch(
            self._GET_MODEL_CLS, return_value=mock_model_no_score_template
        )
        template_patch = patch(
            self._APPLY_CHAT_TEMPLATE, return_value="test querytest doc"
        )
        with model_cls_patch, template_patch:
            full_prompt, engine_prompt = get_score_prompt(
                llm_reranker_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "test query",
                "test doc",
            )

        assert full_prompt == "test querytest doc"
        assert "prompt_token_ids" in engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_fallback_with_sep_token(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Fallback when template resolution fails and use_sep_token=True."""
        model_cls_patch = patch(
            self._GET_MODEL_CLS, return_value=mock_model_no_score_template
        )
        template_patch = patch(
            self._APPLY_CHAT_TEMPLATE,
            side_effect=ChatTemplateResolutionError("No template"),
        )
        with model_cls_patch, template_patch:
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,  # use_sep_token=True
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )

        assert "prompt_token_ids" in engine_prompt
        # Encoding as a text pair yields token_type_ids as well.
        assert "token_type_ids" in engine_prompt
        assert "query" in full_prompt
        assert "document" in full_prompt
        assert full_prompt != "querydocument"

        pair_encoding = cross_encoder_tokenizer(
            "query", text_pair="document", add_special_tokens=True
        )
        assert engine_prompt["prompt_token_ids"] == pair_encoding["input_ids"]

        # FIXME(?): add_special_tokens=False is needed because in this case
        # full_prompt is obtained by decoding the tokenized prompt, which
        # already contains the special tokens; re-adding them would duplicate
        # them. This is inconsistent with the other cases.
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer,
            full_prompt,
            engine_prompt,
            add_special_tokens=False,
        )

    def test_fallback_without_sep_token(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Fallback when template resolution fails and use_sep_token=False."""
        model_cls_patch = patch(
            self._GET_MODEL_CLS, return_value=mock_model_no_score_template
        )
        template_patch = patch(
            self._APPLY_CHAT_TEMPLATE,
            side_effect=ChatTemplateResolutionError("No template"),
        )
        with model_cls_patch, template_patch:
            full_prompt, engine_prompt = get_score_prompt(
                llm_reranker_model_config,  # use_sep_token=False
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )

        assert full_prompt == "querydocument"
        assert "prompt_token_ids" in engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

    def test_post_process_tokens_called(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """post_process_tokens is invoked exactly once with the engine prompt."""
        recorded_calls = mock_model_with_score_template.post_process_called
        # Start from an empty call log.
        recorded_calls.clear()

        model_cls_patch = patch(
            self._GET_MODEL_CLS, return_value=mock_model_with_score_template
        )
        template_patch = patch(
            self._APPLY_CHAT_TEMPLATE,
            side_effect=ChatTemplateResolutionError("No template"),
        )
        with model_cls_patch, template_patch:
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
            )

        assert len(mock_model_with_score_template.post_process_called) == 1
        assert mock_model_with_score_template.post_process_called[0] is engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )
......@@ -37,7 +37,7 @@ def server():
"--max-num-seqs",
"128",
"--worker-extension-cls",
"tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension",
"tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension",
]
with RemoteOpenAIServer(
MODEL_NAME,
......
......@@ -5,7 +5,7 @@ import os
import requests
from prometheus_client.parser import text_string_to_metric_families
from ...utils import RemoteOpenAIServer, models_path_prefix
from tests.utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
......
......@@ -26,9 +26,9 @@ from vllm.entrypoints.chat_utils import (
)
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import (
encode_audio_base64,
encode_image_base64,
encode_video_base64,
encode_audio_url,
encode_image_url,
encode_video_url,
)
from vllm.tokenizers import get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
......@@ -142,22 +142,19 @@ def mistral_model_config():
@pytest.fixture(scope="module")
def image_url():
image = ImageAsset("cherry_blossom")
base64 = encode_image_base64(image.pil_image)
return f"data:image/jpeg;base64,{base64}"
return encode_image_url(image.pil_image)
@pytest.fixture(scope="module")
def video_url():
video = VideoAsset("baby_reading", 1)
base64 = encode_video_base64(video.np_ndarrays)
return f"data:video/jpeg;base64,{base64}"
return encode_video_url(video.np_ndarrays)
@pytest.fixture(scope="module")
def audio_url():
audio = AudioAsset("mary_had_lamb")
base64 = encode_audio_base64(*audio.audio_and_sample_rate)
return f"data:audio/ogg;base64,{base64}"
return encode_audio_url(*audio.audio_and_sample_rate)
def _assert_mm_data_is_image_input(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
End-to-end tests for the vLLM gRPC server.
"""
import asyncio
import socket
import subprocess
import sys
import time
import grpc
import pytest
import pytest_asyncio
from vllm.grpc import vllm_engine_pb2, vllm_engine_pb2_grpc
# Use a small model for fast testing
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
def find_free_port() -> int:
    """Return a TCP port number that the OS reported as free.

    A temporary socket is bound to port 0 so the kernel picks an unused port.
    The socket is closed again before the caller receives the number.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.bind(("", 0))
        probe.listen(1)
        _, chosen_port = probe.getsockname()
    return chosen_port
async def wait_for_server(port: int, timeout: float = 60.0) -> bool:
    """Poll the gRPC HealthCheck RPC until the server reports healthy.

    Args:
        port: Port on ``localhost`` where the server is expected to listen.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        ``True`` as soon as a health check answers with ``healthy`` set,
        ``False`` if the timeout elapses first.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    print("waiting for server to start...")
    while time.monotonic() < deadline:
        try:
            channel = grpc.aio.insecure_channel(f"localhost:{port}")
            try:
                stub = vllm_engine_pb2_grpc.VllmEngineStub(channel)
                request = vllm_engine_pb2.HealthCheckRequest()
                response = await stub.HealthCheck(request, timeout=5.0)
                if response.healthy:
                    print("server returned healthy=True")
                    return True
            finally:
                # Close the channel on every path, including failed attempts,
                # so retries do not accumulate open channels.
                await channel.close()
        except Exception:
            # Deliberate best-effort: the server is most likely not accepting
            # connections yet, so fall through to the back-off below.
            pass
        # Back off after both failed and unhealthy attempts to avoid spinning.
        await asyncio.sleep(0.5)
    return False
class GrpcServerProcess:
    """Manages a gRPC server running in a subprocess."""

    def __init__(self):
        # Handle of the launched server process; None until start() is called.
        self.process: subprocess.Popen | None = None
        # Port the server was asked to listen on; None until start() is called.
        self.port: int | None = None

    async def start(self):
        """Launch the gRPC server subprocess and wait until it is healthy.

        Raises:
            RuntimeError: If the server does not become healthy within the
                timeout used by ``wait_for_server``. The subprocess is stopped
                before the error is raised.
        """
        self.port = find_free_port()

        self.process = subprocess.Popen(
            [
                sys.executable,
                "-m",
                "vllm.entrypoints.grpc_server",
                "--model",
                MODEL_NAME,
                "--host",
                "localhost",
                "--port",
                str(self.port),
                "--max-num-batched-tokens",
                "512",
                "--disable-log-stats-server",
            ],
        )

        try:
            ready = await wait_for_server(self.port)
        except BaseException:
            # Cancellation or an interrupt while waiting must not leave an
            # orphaned server process behind.
            self.stop()
            raise

        if not ready:
            self.stop()
            raise RuntimeError("gRPC server failed to start within timeout")

    def stop(self):
        """Terminate the server process, escalating to kill after 10 seconds.

        Does nothing when no process has been started.
        """
        if self.process:
            self.process.terminate()
            try:
                self.process.wait(timeout=10)
            except subprocess.TimeoutExpired:
                self.process.kill()
                self.process.wait()
@pytest_asyncio.fixture(scope="module")
async def grpc_server():
    """Module-scoped fixture: start one server subprocess, yield it, then stop it."""
    managed_server = GrpcServerProcess()
    await managed_server.start()
    yield managed_server
    # Teardown: shut the subprocess down once the module's tests are done.
    managed_server.stop()
@pytest_asyncio.fixture
async def grpc_client(grpc_server):
    """Per-test fixture yielding a ``VllmEngineStub`` bound to the test server.

    The underlying insecure channel is closed after the test finishes.
    """
    target = f"localhost:{grpc_server.port}"
    channel = grpc.aio.insecure_channel(target)
    yield vllm_engine_pb2_grpc.VllmEngineStub(channel)
    await channel.close()
@pytest.mark.asyncio
async def test_health_check(grpc_client):
    """HealthCheck reports a healthy server with the expected message."""
    reply = await grpc_client.HealthCheck(vllm_engine_pb2.HealthCheckRequest())

    assert reply.healthy is True
    assert reply.message == "Health"
@pytest.mark.asyncio
async def test_get_model_info(grpc_client):
    """GetModelInfo describes the configured text-generation model."""
    info = await grpc_client.GetModelInfo(vllm_engine_pb2.GetModelInfoRequest())

    assert info.model_path == MODEL_NAME
    assert info.is_generation is True
    assert info.max_context_length > 0
    assert info.vocab_size > 0
    assert info.supports_vision is False
@pytest.mark.asyncio
async def test_get_server_info(grpc_client):
    """GetServerInfo returns sane runtime statistics for a running server."""
    info = await grpc_client.GetServerInfo(vllm_engine_pb2.GetServerInfoRequest())

    assert info.active_requests >= 0
    assert info.is_paused is False
    assert info.uptime_seconds >= 0
    assert info.server_type == "vllm-grpc"
    assert info.last_receive_timestamp > 0
@pytest.mark.asyncio
async def test_generate_non_streaming(grpc_client):
    """A non-streaming Generate call yields exactly one ``complete`` message."""
    prompt = vllm_engine_pb2.TokenizedInput(
        original_text="Hello, my name is",
        # Token IDs the original author took from GPT-2 for this prompt.
        input_ids=[15496, 11, 616, 1438, 318],
    )
    sampling = vllm_engine_pb2.SamplingParams(
        temperature=0.0,
        max_tokens=10,
        n=1,
    )
    request = vllm_engine_pb2.GenerateRequest(
        request_id="test-non-streaming-1",
        tokenized=prompt,
        sampling_params=sampling,
        stream=False,
    )

    # Drain the response stream.
    responses = [response async for response in grpc_client.Generate(request)]

    # Non-streaming mode produces a single, final message.
    assert len(responses) == 1
    final_response = responses[0]
    assert final_response.HasField("complete")

    complete = final_response.complete
    assert len(complete.output_ids) > 0
    assert complete.finish_reason in ["stop", "length"]
    assert complete.prompt_tokens > 0
    assert complete.completion_tokens > 0
@pytest.mark.asyncio
async def test_generate_streaming(grpc_client):
    """Test the Generate RPC in streaming mode.

    The stream must end with a ``complete`` message. Any ``chunk`` messages
    that arrive before it are checked for sane token counts.
    """
    request = vllm_engine_pb2.GenerateRequest(
        request_id="test-streaming-1",
        tokenized=vllm_engine_pb2.TokenizedInput(
            original_text="The capital of France is",
            # Token IDs the original author took from GPT-2 for this prompt.
            input_ids=[464, 3139, 286, 4881, 318],
        ),
        sampling_params=vllm_engine_pb2.SamplingParams(
            temperature=0.0, max_tokens=10, n=1
        ),
        stream=True,
    )

    # Split the stream into intermediate chunks and the final message.
    chunks = []
    complete_response = None
    async for response in grpc_client.Generate(request):
        if response.HasField("chunk"):
            chunks.append(response.chunk)
        elif response.HasField("complete"):
            complete_response = response.complete

    # No assertion on the number of chunks: there may be none if generation
    # is very fast, and ``len(chunks) >= 0`` could never fail anyway.

    # Should have a final complete response
    assert complete_response is not None
    assert complete_response.finish_reason in ["stop", "length"]
    assert complete_response.prompt_tokens > 0

    # Verify chunk structure
    for chunk in chunks:
        assert chunk.prompt_tokens > 0
        assert chunk.completion_tokens >= 0
@pytest.mark.asyncio
async def test_generate_with_different_sampling_params(grpc_client):
    """Generate completes under both nucleus (top_p) and top_k sampling."""
    # (request id, SamplingParams keyword arguments), run one after the other.
    cases = [
        ("test-sampling-temp", {"temperature": 0.8, "top_p": 0.95, "max_tokens": 5}),
        ("test-sampling-topk", {"temperature": 1.0, "top_k": 50, "max_tokens": 5}),
    ]

    for request_id, sampling_kwargs in cases:
        request = vllm_engine_pb2.GenerateRequest(
            request_id=request_id,
            tokenized=vllm_engine_pb2.TokenizedInput(
                original_text="Hello",
                input_ids=[15496],
            ),
            sampling_params=vllm_engine_pb2.SamplingParams(**sampling_kwargs),
            stream=False,
        )

        responses = [r async for r in grpc_client.Generate(request)]
        assert len(responses) == 1
        assert responses[0].HasField("complete")
@pytest.mark.asyncio
async def test_generate_with_stop_strings(grpc_client):
    """Generate accepts stop strings and still finishes with a valid reason."""
    sampling = vllm_engine_pb2.SamplingParams(
        temperature=0.0,
        max_tokens=20,
        stop=["\n", "END"],
    )
    request = vllm_engine_pb2.GenerateRequest(
        request_id="test-stop-strings",
        tokenized=vllm_engine_pb2.TokenizedInput(
            original_text="Hello",
            input_ids=[15496],
        ),
        sampling_params=sampling,
        stream=False,
    )

    responses = [r async for r in grpc_client.Generate(request)]

    assert len(responses) == 1
    only_response = responses[0]
    assert only_response.HasField("complete")
    assert only_response.complete.finish_reason in ["stop", "length"]
@pytest.mark.asyncio
async def test_generate_multiple_requests(grpc_client):
    """Three Generate requests issued concurrently all run to completion."""

    async def make_request(request_id: str):
        # Issue one non-streaming request and return its first response.
        request = vllm_engine_pb2.GenerateRequest(
            request_id=request_id,
            tokenized=vllm_engine_pb2.TokenizedInput(
                original_text="Hello",
                input_ids=[15496],
            ),
            sampling_params=vllm_engine_pb2.SamplingParams(
                temperature=0.0, max_tokens=5
            ),
            stream=False,
        )
        collected = [r async for r in grpc_client.Generate(request)]
        return collected[0]

    # Fire all requests at once and wait for every result.
    pending = [make_request(f"test-concurrent-{i}") for i in range(3)]
    responses = await asyncio.gather(*pending)

    assert len(responses) == 3
    for response in responses:
        assert response.HasField("complete")
@pytest.mark.asyncio
async def test_generate_with_seed(grpc_client):
    """Two sampled generations with the same seed yield identical output IDs."""

    def make_request(request_id: str, seed: int):
        # Same prompt and sampling settings; only the id and seed vary.
        sampling = vllm_engine_pb2.SamplingParams(
            temperature=1.0, max_tokens=10, seed=seed
        )
        prompt = vllm_engine_pb2.TokenizedInput(
            original_text="The future of AI is",
            input_ids=[464, 2003, 286, 9552, 318],
        )
        return vllm_engine_pb2.GenerateRequest(
            request_id=request_id,
            tokenized=prompt,
            sampling_params=sampling,
            stream=False,
        )

    first_request = make_request("test-seed-1", 42)
    second_request = make_request("test-seed-2", 42)

    first_responses = [r async for r in grpc_client.Generate(first_request)]
    second_responses = [r async for r in grpc_client.Generate(second_request)]

    # Each request finishes with a single complete message.
    assert len(first_responses) == 1
    assert len(second_responses) == 1
    assert first_responses[0].HasField("complete")
    assert second_responses[0].HasField("complete")

    # Identical seeds must reproduce the same tokens.
    first_ids = list(first_responses[0].complete.output_ids)
    second_ids = list(second_responses[0].complete.output_ids)
    assert first_ids == second_ids
@pytest.mark.asyncio
async def test_generate_error_handling(grpc_client):
    """An out-of-range ``top_p`` is rejected with INVALID_ARGUMENT."""
    invalid_sampling = vllm_engine_pb2.SamplingParams(
        temperature=0.0, max_tokens=10, top_p=-33
    )
    request = vllm_engine_pb2.GenerateRequest(
        request_id="test-error-invalid-topp",
        sampling_params=invalid_sampling,
        stream=False,
    )

    # Consuming the stream is what surfaces the RPC error.
    with pytest.raises(grpc.RpcError) as exc_info:
        async for _ in grpc_client.Generate(request):
            pass

    error = exc_info.value
    assert error.code() == grpc.StatusCode.INVALID_ARGUMENT
    assert "top_p must be in (0, 1], got -33.0" in error.details()
@pytest.mark.asyncio
async def test_abort_request(grpc_client):
    """The out-of-band Abort RPC cuts a long streaming generation short."""
    request_id = "test-abort-1"

    # min_tokens == max_tokens == 500 keeps the generation running long
    # enough for the abort to arrive while it is still in flight.
    generate_request = vllm_engine_pb2.GenerateRequest(
        request_id=request_id,
        tokenized=vllm_engine_pb2.TokenizedInput(
            original_text="Hello",
            input_ids=[15496],
        ),
        sampling_params=vllm_engine_pb2.SamplingParams(
            temperature=0.0,
            min_tokens=500,
            max_tokens=500,
        ),
        stream=True,
    )

    # Updated by run_generate() as stream messages arrive.
    was_aborted = False
    received_chunks = 0

    async def run_generate():
        nonlocal was_aborted, received_chunks
        async for response in grpc_client.Generate(generate_request):
            if response.HasField("chunk"):
                received_chunks += 1
            # Only a "complete" message whose finish reason is "abort" counts;
            # every other message resets the flag.
            was_aborted = (
                response.HasField("complete")
                and response.complete.finish_reason == "abort"
            )

    async def abort_after_delay():
        # Give the generation a moment to start before aborting it.
        await asyncio.sleep(0.1)
        await grpc_client.Abort(
            vllm_engine_pb2.AbortRequest(request_ids=[request_id])
        )

    # Drive the generation and the delayed abort side by side.
    await asyncio.gather(run_generate(), abort_after_delay())

    # The last message must carry the "abort" finish reason, and the stream
    # must have ended before all 500 tokens were produced.
    assert was_aborted and received_chunks < 500, (
        "Request should have been aborted before generating all 500 tokens"
    )
......@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from openai.types.chat import ChatCompletionMessageParam
from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
......@@ -14,8 +15,10 @@ from openai.types.responses.response_reasoning_item import (
Summary,
)
from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.responses_utils import (
_construct_single_message_from_response_item,
_maybe_combine_reasoning_and_tool_call,
construct_chat_messages_with_tool_call,
convert_tool_responses_to_completions_format,
)
......@@ -160,3 +163,118 @@ class TestResponsesUtils:
formatted_item = _construct_single_message_from_response_item(output_item)
assert formatted_item["role"] == "assistant"
assert formatted_item["content"] == "dongyi"
class TestMaybeCombineReasoningAndToolCall:
    """Unit tests for the `_maybe_combine_reasoning_and_tool_call` helper."""

    @staticmethod
    def _make_function_call(item_id, arguments="{}"):
        """Build a function tool call that varies only in id and arguments.

        Every test in this class uses the same type, call_id and name, so
        only the fields that actually differ between cases are parameters.
        """
        return ResponseFunctionToolCall(
            type="function_call",
            id=item_id,
            call_id="call_123",
            name="test_function",
            arguments=arguments,
        )

    def test_returns_none_when_item_id_is_none(self):
        """
        Regression check for PR #31999: an item whose id is None must give
        None back rather than raising TypeError from startswith().
        """
        history: list[ChatCompletionMessageParam] = []
        # id=None is the value that raised TypeError before the fix
        tool_call = self._make_function_call(None)
        assert _maybe_combine_reasoning_and_tool_call(tool_call, history) is None

    def test_returns_none_when_id_does_not_start_with_mcp_prefix(self):
        """A tool call whose id lacks the MCP prefix is not combined."""
        history = [{"role": "assistant", "reasoning": "some reasoning"}]
        # "regular_id" does not start with MCP_PREFIX
        tool_call = self._make_function_call("regular_id")
        assert _maybe_combine_reasoning_and_tool_call(tool_call, history) is None

    def test_returns_none_when_last_message_is_not_assistant(self):
        """None is returned when the last message is not from the assistant."""
        history = [{"role": "user", "content": "hello"}]
        tool_call = self._make_function_call(f"{MCP_PREFIX}tool_id")
        assert _maybe_combine_reasoning_and_tool_call(tool_call, history) is None

    def test_returns_none_when_last_message_has_no_reasoning(self):
        """None is returned when the assistant message carries no reasoning."""
        history = [{"role": "assistant", "content": "some content"}]
        tool_call = self._make_function_call(f"{MCP_PREFIX}tool_id")
        assert _maybe_combine_reasoning_and_tool_call(tool_call, history) is None

    def test_combines_reasoning_and_mcp_tool_call(self):
        """A reasoning message and an MCP tool call merge into one message."""
        history = [{"role": "assistant", "reasoning": "I need to call this tool"}]
        tool_call = self._make_function_call(
            f"{MCP_PREFIX}tool_id",
            arguments='{"arg": "value"}',
        )

        combined = _maybe_combine_reasoning_and_tool_call(tool_call, history)

        assert combined is not None
        assert combined["role"] == "assistant"
        assert combined["reasoning"] == "I need to call this tool"
        assert "tool_calls" in combined

        emitted_calls = combined["tool_calls"]
        assert len(emitted_calls) == 1

        only_call = emitted_calls[0]
        assert only_call["id"] == "call_123"
        assert only_call["function"]["name"] == "test_function"
        assert only_call["function"]["arguments"] == '{"arg": "value"}'
        assert only_call["type"] == "function"

    def test_returns_none_for_non_function_tool_call_type(self):
        """Items that are not a ResponseFunctionToolCall give None."""
        history = [{"role": "assistant", "reasoning": "some reasoning"}]
        # A plain dict stands in for a non-ResponseFunctionToolCall item
        not_a_tool_call = {"type": "message", "content": "hello"}
        assert (
            _maybe_combine_reasoning_and_tool_call(not_a_tool_call, history) is None
        )

    def test_returns_none_when_id_is_empty_string(self):
        """An empty-string id gives None (falsy check)."""
        history = [{"role": "assistant", "reasoning": "some reasoning"}]
        # The empty string is falsy
        tool_call = self._make_function_call("")
        assert _maybe_combine_reasoning_and_tool_call(tool_call, history) is None
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.entrypoints.utils import sanitize_message
def test_sanitize_message():
    """Check that the ` at 0x...` address suffix is dropped from an object repr."""
    raw_repr = "<_io.BytesIO object at 0x7a95e299e750>"
    expected = "<_io.BytesIO object>"
    assert sanitize_message(raw_repr) == expected
......@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation,
### Run tests with pytest (like buildkite)
```bash
pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \
--config-list-file=configs/models-small.txt \
--tp-size=1
pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
--config-list-file=configs/models-small.txt
```
### Run standalone evaluation script
......@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct"
accuracy_threshold: 0.54 # Minimum expected accuracy
num_questions: 1319 # Number of questions (default: full test set)
num_fewshot: 5 # Few-shot examples from train set
max_model_len: 4096 # Model context length
server_args: "--max-model-len 4096 --tensor-parallel-size 2" # Server arguments
env: # Environment variables (optional)
VLLM_USE_FLASHINFER_MOE_FP4: "1"
```
The `server_args` field accepts any arguments that can be passed to `vllm serve`.
The `env` field accepts a dictionary of environment variables to set for the server process.
# Accuracy-eval config for deepseek-ai/DeepSeek-R1, data-parallel variant:
# eager mode, data-parallel size 8 with expert parallelism enabled, and MTP
# speculative decoding with 1 speculative token.
# NOTE(review): startup_max_wait_seconds is presumably the server start-up
# timeout in seconds - confirm against the test harness.
model_name: "deepseek-ai/DeepSeek-R1"
accuracy_threshold: 0.95
num_questions: 1319
num_fewshot: 5
startup_max_wait_seconds: 1200
server_args: >-
--enforce-eager
--max-model-len 4096
--data-parallel-size 8
--enable-expert-parallel
--speculative-config '{"method":"mtp","num_speculative_tokens":1}'
# Accuracy-eval config for deepseek-ai/DeepSeek-R1, tensor-parallel variant:
# eager mode, tensor-parallel size 8 with expert parallelism enabled, and MTP
# speculative decoding with 1 speculative token.
# NOTE(review): startup_max_wait_seconds is presumably the server start-up
# timeout in seconds - confirm against the test harness.
model_name: "deepseek-ai/DeepSeek-R1"
accuracy_threshold: 0.95
num_questions: 1319
num_fewshot: 5
startup_max_wait_seconds: 1200
server_args: >-
--enforce-eager
--max-model-len 4096
--tensor-parallel-size 8
--enable-expert-parallel
--speculative-config '{"method":"mtp","num_speculative_tokens":1}'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment