Commit 3fb4b5fa authored by zhuwenwen
Browse files

Merge tag 'v0.18.0' into v0.18.0-ori

parents bcf25339 89138b21
...@@ -10,6 +10,12 @@ import pytest ...@@ -10,6 +10,12 @@ import pytest
import pytest_asyncio import pytest_asyncio
from openai import OpenAI from openai import OpenAI
from tests.entrypoints.openai.utils import (
accumulate_streaming_response,
verify_chat_response,
verify_harmony_messages,
)
from tests.utils import RemoteOpenAIServer
from vllm._aiter_ops import is_aiter_found_and_supported from vllm._aiter_ops import is_aiter_found_and_supported
from vllm.config import MultiModalConfig from vllm.config import MultiModalConfig
from vllm.entrypoints.openai.chat_completion.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
...@@ -21,8 +27,14 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -21,8 +27,14 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
RequestResponseMetadata, RequestResponseMetadata,
) )
from vllm.entrypoints.openai.models.serving import BaseModelPath, OpenAIServingModels from vllm.entrypoints.openai.models.serving import (
BaseModelPath,
OpenAIModelRegistry,
OpenAIServingModels,
)
from vllm.entrypoints.openai.parser.harmony_utils import get_encoding from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
from vllm.exceptions import VLLMValidationError
from vllm.inputs import TokensPrompt from vllm.inputs import TokensPrompt
from vllm.outputs import CompletionOutput, RequestOutput from vllm.outputs import CompletionOutput, RequestOutput
from vllm.renderers.hf import HfRenderer from vllm.renderers.hf import HfRenderer
...@@ -33,13 +45,6 @@ from vllm.tokenizers.registry import tokenizer_args_from_config ...@@ -33,13 +45,6 @@ from vllm.tokenizers.registry import tokenizer_args_from_config
from vllm.tool_parsers import ToolParserManager from vllm.tool_parsers import ToolParserManager
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import RemoteOpenAIServer
from .utils import (
accumulate_streaming_response,
verify_chat_response,
verify_harmony_messages,
)
GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b" GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"
GPT_OSS_SPECULATOR_NAME = "RedHatAI/gpt-oss-20b-speculator.eagle3" GPT_OSS_SPECULATOR_NAME = "RedHatAI/gpt-oss-20b-speculator.eagle3"
...@@ -126,7 +131,7 @@ def gptoss_speculative_server(default_server_args: list[str]): ...@@ -126,7 +131,7 @@ def gptoss_speculative_server(default_server_args: list[str]):
if is_aiter_found_and_supported(): if is_aiter_found_and_supported():
env_dict = {"VLLM_ROCM_USE_AITER": "1"} env_dict = {"VLLM_ROCM_USE_AITER": "1"}
with RemoteOpenAIServer( with RemoteOpenAIServer(
GPT_OSS_MODEL_NAME, server_args, env_dict=env_dict GPT_OSS_MODEL_NAME, server_args, env_dict=env_dict, max_wait_seconds=480
) as remote_server: ) as remote_server:
yield remote_server yield remote_server
...@@ -520,38 +525,67 @@ class MockModelConfig: ...@@ -520,38 +525,67 @@ class MockModelConfig:
multimodal_config = MultiModalConfig() multimodal_config = MultiModalConfig()
hf_config = MockHFConfig() hf_config = MockHFConfig()
logits_processors: list[str] | None = None logits_processors: list[str] | None = None
logits_processor_pattern = None
diff_sampling_param: dict | None = None diff_sampling_param: dict | None = None
allowed_local_media_path: str = "" allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None allowed_media_domains: list[str] | None = None
encoder_config = None encoder_config = None
generation_config: str = "auto" generation_config: str = "auto"
override_generation_config: dict[str, Any] = field(default_factory=dict)
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
skip_tokenizer_init: bool = False skip_tokenizer_init: bool = False
is_encoder_decoder: bool = False is_encoder_decoder: bool = False
is_multimodal_model: bool = False
def get_diff_sampling_param(self): def get_diff_sampling_param(self):
return self.diff_sampling_param or {} return self.diff_sampling_param or {}
@dataclass
class MockParallelConfig:
    """Minimal stand-in for a parallel config used when building renderers in tests."""

    # The only field this stub defines; defaults to rank 0.
    # NOTE(review): presumably mirrors the attribute the renderer reads from
    # the real parallel config — confirm against the renderer implementation.
    _api_process_rank: int = 0
@dataclass
class MockVllmConfig:
    """Test double that bundles a model config with a parallel config.

    Instances are passed to ``HfRenderer.from_config`` and ``MistralRenderer``
    in these tests in place of a full vLLM config object.
    """

    # Mocked model configuration the renderer is built from.
    model_config: MockModelConfig
    # Mocked parallel configuration; no default, so callers must supply one.
    parallel_config: MockParallelConfig
def _build_renderer(model_config: MockModelConfig): def _build_renderer(model_config: MockModelConfig):
_, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config) _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)
return HfRenderer( return HfRenderer.from_config(
model_config, MockVllmConfig(model_config, parallel_config=MockParallelConfig()),
tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name}, tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
) )
def _build_serving_render(
    engine, model_registry: OpenAIModelRegistry
) -> OpenAIServingRender:
    """Create the ``OpenAIServingRender`` used by the serving-chat tests.

    Args:
        engine: Engine (real or mocked) exposing ``model_config``,
            ``renderer`` and ``io_processor`` attributes.
        model_registry: Registry of served models to attach to the renderer.

    Returns:
        An ``OpenAIServingRender`` configured with the module-level
        ``CHAT_TEMPLATE``, ``"auto"`` content format and no request logger.
    """
    # Everything the render layer needs is read straight off the engine.
    render_kwargs = {
        "model_config": engine.model_config,
        "renderer": engine.renderer,
        "io_processor": engine.io_processor,
        "model_registry": model_registry,
        "request_logger": None,
        "chat_template": CHAT_TEMPLATE,
        "chat_template_content_format": "auto",
    }
    return OpenAIServingRender(**render_kwargs)
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
models = OpenAIServingModels( models = OpenAIServingModels(
engine_client=engine, engine_client=engine,
base_model_paths=BASE_MODEL_PATHS, base_model_paths=BASE_MODEL_PATHS,
) )
openai_serving_render = _build_serving_render(engine, models.registry)
serving_chat = OpenAIServingChat( serving_chat = OpenAIServingChat(
engine, engine,
models, models,
response_role="assistant", response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto", chat_template_content_format="auto",
request_logger=None, request_logger=None,
...@@ -572,10 +606,13 @@ async def _async_serving_chat_init(): ...@@ -572,10 +606,13 @@ async def _async_serving_chat_init():
engine = MockEngine() engine = MockEngine()
models = OpenAIServingModels(engine, BASE_MODEL_PATHS) models = OpenAIServingModels(engine, BASE_MODEL_PATHS)
openai_serving_render = _build_serving_render(engine, models.registry)
serving_completion = OpenAIServingChat( serving_completion = OpenAIServingChat(
engine, engine,
models, models,
response_role="assistant", response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto", chat_template_content_format="auto",
request_logger=None, request_logger=None,
...@@ -645,12 +682,10 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -645,12 +682,10 @@ async def test_serving_chat_should_set_correct_max_tokens():
assert mock_engine.generate.call_args.args[1].max_tokens == 10 assert mock_engine.generate.call_args.args[1].max_tokens == 10
# Setting server's max_tokens in the generation_config.json # Model author's generation_config.json sets max_tokens (auto, no override)
# lower than context_window - prompt_tokens # — should act as fallback only, not ceiling
mock_model_config = MockModelConfig() mock_model_config = MockModelConfig()
mock_model_config.diff_sampling_param = { mock_model_config.diff_sampling_param = {"max_tokens": 10}
"max_tokens": 10 # Setting server-side max_tokens limit
}
# Reinitialize the engine with new settings # Reinitialize the engine with new settings
mock_engine = MagicMock(spec=AsyncLLM) mock_engine = MagicMock(spec=AsyncLLM)
...@@ -674,13 +709,14 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -674,13 +709,14 @@ async def test_serving_chat_should_set_correct_max_tokens():
assert mock_engine.generate.call_args.args[1].max_tokens == 10 assert mock_engine.generate.call_args.args[1].max_tokens == 10
# Test Case 2: Request's max_tokens set higher than server accepts # Test Case 2: Request's max_tokens set higher than generation_config
# default so request-provided max_tokens takes precedence
req.max_tokens = 15 req.max_tokens = 15
with suppress(Exception): with suppress(Exception):
await serving_chat.create_chat_completion(req) await serving_chat.create_chat_completion(req)
assert mock_engine.generate.call_args.args[1].max_tokens == 10 assert mock_engine.generate.call_args.args[1].max_tokens == 15
# Test Case 3: Request's max_tokens set lower than server accepts # Test Case 3: Request's max_tokens set lower than server accepts
req.max_tokens = 5 req.max_tokens = 5
...@@ -690,12 +726,52 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -690,12 +726,52 @@ async def test_serving_chat_should_set_correct_max_tokens():
assert mock_engine.generate.call_args.args[1].max_tokens == 5 assert mock_engine.generate.call_args.args[1].max_tokens == 5
# User explicitly sets max_tokens via --override-generation-config
# — should act as a ceiling
mock_model_config = MockModelConfig()
mock_model_config.diff_sampling_param = {"max_tokens": 10}
mock_model_config.override_generation_config = {"max_new_tokens": 10}
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
serving_chat = _build_serving_chat(mock_engine)
# Test Case 3.1: No max_tokens — uses override as default
req = ChatCompletionRequest(
model=MODEL_NAME,
messages=[{"role": "user", "content": "what is 1+1?"}],
)
with suppress(Exception):
await serving_chat.create_chat_completion(req)
assert mock_engine.generate.call_args.args[1].max_tokens == 10
# Test Case 3.2: Request max_tokens higher — capped by user ceiling from override
req.max_tokens = 15
with suppress(Exception):
await serving_chat.create_chat_completion(req)
assert mock_engine.generate.call_args.args[1].max_tokens == 10
# Test Case 3.3: Request max_tokens lower — respected
req.max_tokens = 5
with suppress(Exception):
await serving_chat.create_chat_completion(req)
assert mock_engine.generate.call_args.args[1].max_tokens == 5
# Setting server's max_tokens in the generation_config.json # Setting server's max_tokens in the generation_config.json
# higher than context_window - prompt_tokens # higher than context_window - prompt_tokens
mock_model_config = MockModelConfig() mock_model_config = MockModelConfig()
mock_model_config.diff_sampling_param = { mock_model_config.diff_sampling_param = {"max_tokens": 200}
"max_tokens": 200 # Setting server-side max_tokens limit
}
# Reinitialize the engine with new settings # Reinitialize the engine with new settings
mock_engine = MagicMock(spec=AsyncLLM) mock_engine = MagicMock(spec=AsyncLLM)
...@@ -749,8 +825,10 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated(): ...@@ -749,8 +825,10 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
mock_tokenizer = MagicMock(spec=MistralTokenizer) mock_tokenizer = MagicMock(spec=MistralTokenizer)
mock_renderer = MistralRenderer(mock_engine.model_config, tokenizer_kwargs={}) mock_renderer = MistralRenderer(
mock_renderer._tokenizer = mock_tokenizer MockVllmConfig(mock_engine.model_config, parallel_config=MockParallelConfig()),
tokenizer=mock_tokenizer,
)
# Force the Mistral chat template renderer to return token IDs. # Force the Mistral chat template renderer to return token IDs.
# Choose a prompt length that is < max_model_len, but large enough that # Choose a prompt length that is < max_model_len, but large enough that
# adding max_tokens should exceed the model context window. # adding max_tokens should exceed the model context window.
...@@ -770,9 +848,8 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated(): ...@@ -770,9 +848,8 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
max_tokens=10, max_tokens=10,
) )
resp = await serving_chat.create_chat_completion(req) with pytest.raises(VLLMValidationError):
assert isinstance(resp, ErrorResponse) await serving_chat.create_chat_completion(req)
assert "context length is only" in resp.error.message
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -788,8 +865,10 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected(): ...@@ -788,8 +865,10 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
mock_tokenizer = MagicMock(spec=MistralTokenizer) mock_tokenizer = MagicMock(spec=MistralTokenizer)
mock_renderer = MistralRenderer(mock_engine.model_config, tokenizer_kwargs={}) mock_renderer = MistralRenderer(
mock_renderer._tokenizer = mock_tokenizer MockVllmConfig(mock_engine.model_config, parallel_config=MockParallelConfig()),
tokenizer=mock_tokenizer,
)
# prompt_token_ids length == max_model_len should be rejected for # prompt_token_ids length == max_model_len should be rejected for
# completion-like requests (ChatCompletionRequest). # completion-like requests (ChatCompletionRequest).
mock_renderer.render_messages_async = AsyncMock( mock_renderer.render_messages_async = AsyncMock(
...@@ -810,9 +889,8 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected(): ...@@ -810,9 +889,8 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
max_tokens=1, max_tokens=1,
) )
resp = await serving_chat.create_chat_completion(req) with pytest.raises(VLLMValidationError):
assert isinstance(resp, ErrorResponse) await serving_chat.create_chat_completion(req)
assert "context length is only" in resp.error.message
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -1127,7 +1205,9 @@ class TestServingChatWithHarmony: ...@@ -1127,7 +1205,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input # Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req) input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages( verify_harmony_messages(
input_messages, input_messages,
[ [
...@@ -1154,7 +1234,9 @@ class TestServingChatWithHarmony: ...@@ -1154,7 +1234,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input # Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages( verify_harmony_messages(
input_messages_2, input_messages_2,
[ [
...@@ -1175,7 +1257,9 @@ class TestServingChatWithHarmony: ...@@ -1175,7 +1257,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input # Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req) input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages( verify_harmony_messages(
input_messages, input_messages,
[ [
...@@ -1219,7 +1303,9 @@ class TestServingChatWithHarmony: ...@@ -1219,7 +1303,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input # Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages( verify_harmony_messages(
input_messages_2, input_messages_2,
[ [
...@@ -1256,7 +1342,9 @@ class TestServingChatWithHarmony: ...@@ -1256,7 +1342,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input # Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req) input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages( verify_harmony_messages(
input_messages, input_messages,
[ [
...@@ -1300,7 +1388,9 @@ class TestServingChatWithHarmony: ...@@ -1300,7 +1388,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input # Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages( verify_harmony_messages(
input_messages_2, input_messages_2,
[ [
...@@ -1337,7 +1427,9 @@ class TestServingChatWithHarmony: ...@@ -1337,7 +1427,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input # Test the Harmony messages for the first turn's input
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages, _ = serving_chat._make_request_with_harmony(req) input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages( verify_harmony_messages(
input_messages, input_messages,
[ [
...@@ -1381,7 +1473,9 @@ class TestServingChatWithHarmony: ...@@ -1381,7 +1473,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input # Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) input_messages_2, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_2)
)
verify_harmony_messages( verify_harmony_messages(
input_messages_2, input_messages_2,
[ [
...@@ -1431,7 +1525,9 @@ class TestServingChatWithHarmony: ...@@ -1431,7 +1525,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the third turn's input # Test the Harmony messages for the third turn's input
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3) input_messages_3, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_3)
)
verify_harmony_messages( verify_harmony_messages(
input_messages_3, input_messages_3,
[ [
...@@ -1494,7 +1590,9 @@ class TestServingChatWithHarmony: ...@@ -1494,7 +1590,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the fourth turn's input # Test the Harmony messages for the fourth turn's input
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4) input_messages_4, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req_4)
)
verify_harmony_messages( verify_harmony_messages(
input_messages_4, input_messages_4,
[ [
...@@ -1543,7 +1641,9 @@ class TestServingChatWithHarmony: ...@@ -1543,7 +1641,9 @@ class TestServingChatWithHarmony:
}, },
] ]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req) input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages( verify_harmony_messages(
input_messages, input_messages,
...@@ -1574,7 +1674,9 @@ class TestServingChatWithHarmony: ...@@ -1574,7 +1674,9 @@ class TestServingChatWithHarmony:
}, },
] ]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req) input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages( verify_harmony_messages(
input_messages, input_messages,
...@@ -1603,7 +1705,9 @@ class TestServingChatWithHarmony: ...@@ -1603,7 +1705,9 @@ class TestServingChatWithHarmony:
}, },
] ]
req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) req = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
input_messages, _ = serving_chat._make_request_with_harmony(req) input_messages, _ = (
serving_chat.openai_serving_render._make_request_with_harmony(req)
)
verify_harmony_messages( verify_harmony_messages(
input_messages, input_messages,
...@@ -1634,11 +1738,14 @@ async def test_tool_choice_validation_without_parser(): ...@@ -1634,11 +1738,14 @@ async def test_tool_choice_validation_without_parser():
engine_client=mock_engine, engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS, base_model_paths=BASE_MODEL_PATHS,
) )
openai_serving_render = _build_serving_render(mock_engine, models.registry)
# Create serving_chat without tool_parser (enable_auto_tools=False) # Create serving_chat without tool_parser (enable_auto_tools=False)
serving_chat = OpenAIServingChat( serving_chat = OpenAIServingChat(
mock_engine, mock_engine,
models, models,
response_role="assistant", response_role="assistant",
openai_serving_render=openai_serving_render,
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto", chat_template_content_format="auto",
request_logger=None, request_logger=None,
......
...@@ -180,20 +180,13 @@ class TestExtractHarmonyStreamingDelta: ...@@ -180,20 +180,13 @@ class TestExtractHarmonyStreamingDelta:
assert delta_message.tool_calls[0].index == 1 assert delta_message.tool_calls[0].index == 1
@pytest.mark.parametrize( def test_returns_preambles_as_content(self):
"channel,recipient", """Test that commentary with no recipient (preamble) is user content."""
[
("commentary", None),
("commentary", "browser.search"),
],
)
def test_returns_tool_call_preambles(self, channel, recipient):
"""Test that invalid tool recipient on commentary is treated as content."""
parser = MockStreamableParser() parser = MockStreamableParser()
delta_text = "some text" delta_text = "some text"
token_states = [ token_states = [
TokenState(channel=channel, recipient=recipient, text=delta_text) TokenState(channel="commentary", recipient=None, text=delta_text)
] ]
delta_message, tools_streamed = extract_harmony_streaming_delta( delta_message, tools_streamed = extract_harmony_streaming_delta(
...@@ -211,6 +204,7 @@ class TestExtractHarmonyStreamingDelta: ...@@ -211,6 +204,7 @@ class TestExtractHarmonyStreamingDelta:
[ [
(None, None), (None, None),
("unknown_channel", None), ("unknown_channel", None),
("commentary", "browser.search"),
], ],
) )
def test_returns_none_for_invalid_inputs(self, channel, recipient): def test_returns_none_for_invalid_inputs(self, channel, recipient):
......
...@@ -7,7 +7,7 @@ import httpx ...@@ -7,7 +7,7 @@ import httpx
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from ...utils import RemoteOpenAIServer from tests.utils import RemoteLaunchRenderServer
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
...@@ -16,7 +16,7 @@ MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" ...@@ -16,7 +16,7 @@ MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
def server(): def server():
args: list[str] = [] args: list[str] = []
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteLaunchRenderServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
...@@ -43,23 +43,20 @@ async def test_completion_render_basic(client): ...@@ -43,23 +43,20 @@ async def test_completion_render_basic(client):
assert response.status_code == 200 assert response.status_code == 200
data = response.json() data = response.json()
# Verify response structure # Verify response structure - list of GenerateRequest
assert isinstance(data, list) assert isinstance(data, list)
assert len(data) > 0 assert len(data) > 0
# Verify first prompt # Verify first prompt is a GenerateRequest
first_prompt = data[0] first_prompt = data[0]
assert "prompt_token_ids" in first_prompt assert "token_ids" in first_prompt
assert "prompt" in first_prompt assert "sampling_params" in first_prompt
assert isinstance(first_prompt["prompt_token_ids"], list) assert "model" in first_prompt
assert len(first_prompt["prompt_token_ids"]) > 0 assert "request_id" in first_prompt
assert isinstance(first_prompt["prompt"], str) assert isinstance(first_prompt["token_ids"], list)
assert len(first_prompt["token_ids"]) > 0
# Verify prompt text is preserved assert first_prompt["model"] == MODEL_NAME
assert ( assert first_prompt["request_id"].startswith("cmpl-")
"When should a chat-completions handler return an empty string?"
in first_prompt["prompt"]
)
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -84,36 +81,15 @@ async def test_chat_completion_render_basic(client): ...@@ -84,36 +81,15 @@ async def test_chat_completion_render_basic(client):
assert response.status_code == 200 assert response.status_code == 200
data = response.json() data = response.json()
# Verify response structure - should be [conversation, engine_prompts] # Verify response structure - should be a GenerateRequest
assert isinstance(data, list) assert isinstance(data, dict)
assert len(data) == 2 assert "token_ids" in data
assert isinstance(data["token_ids"], list)
conversation, engine_prompts = data assert len(data["token_ids"]) > 0
# Verify conversation
assert isinstance(conversation, list)
assert len(conversation) > 0
assert conversation[0]["role"] == "user"
assert "empty string" in conversation[0]["content"]
# Verify engine_prompts
assert isinstance(engine_prompts, list)
assert len(engine_prompts) > 0
first_prompt = engine_prompts[0] # Verify token IDs are integers and BOS token is present
assert "prompt_token_ids" in first_prompt token_ids = data["token_ids"]
assert "prompt" in first_prompt
assert isinstance(first_prompt["prompt_token_ids"], list)
assert len(first_prompt["prompt_token_ids"]) > 0
# Verify chat template was applied (should have instruction markers)
assert "[INST]" in first_prompt["prompt"]
assert "[/INST]" in first_prompt["prompt"]
# Verify token IDs are correctly preserved as integers
token_ids = first_prompt["prompt_token_ids"]
assert all(isinstance(tid, int) for tid in token_ids) assert all(isinstance(tid, int) for tid in token_ids)
# Verify BOS token (usually 1 for LLaMA models)
assert token_ids[0] == 1 assert token_ids[0] == 1
...@@ -131,15 +107,18 @@ async def test_completion_render_multiple_prompts(client): ...@@ -131,15 +107,18 @@ async def test_completion_render_multiple_prompts(client):
assert response.status_code == 200 assert response.status_code == 200
data = response.json() data = response.json()
# Should return two prompts # Should return two GenerateRequest items
assert isinstance(data, list) assert isinstance(data, list)
assert len(data) == 2 assert len(data) == 2
# Verify both prompts have required fields # Verify both prompts have GenerateRequest fields
for prompt in data: for prompt in data:
assert "prompt_token_ids" in prompt assert "token_ids" in prompt
assert "prompt" in prompt assert "sampling_params" in prompt
assert len(prompt["prompt_token_ids"]) > 0 assert "model" in prompt
assert "request_id" in prompt
assert len(prompt["token_ids"]) > 0
assert prompt["request_id"].startswith("cmpl-")
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -160,17 +139,49 @@ async def test_chat_completion_render_multi_turn(client): ...@@ -160,17 +139,49 @@ async def test_chat_completion_render_multi_turn(client):
assert response.status_code == 200 assert response.status_code == 200
data = response.json() data = response.json()
conversation, engine_prompts = data # Verify tokenization occurred
assert isinstance(data, dict)
assert "token_ids" in data
assert isinstance(data["token_ids"], list)
assert len(data["token_ids"]) > 0
# Verify all messages preserved
assert len(conversation) == 3
assert conversation[0]["role"] == "user"
assert conversation[1]["role"] == "assistant"
assert conversation[2]["role"] == "user"
# Verify tokenization occurred @pytest.mark.asyncio
assert len(engine_prompts) > 0 async def test_chat_completion_render_with_stream_true(client):
assert len(engine_prompts[0]["prompt_token_ids"]) > 0 """Render accepts stream params but still returns JSON (non-streamed)."""
response = await client.post(
"/v1/chat/completions/render",
json={
"model": MODEL_NAME,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True,
},
"messages": [
{
"role": "user",
"content": "Stream options should be accepted by /render.",
}
],
},
)
assert response.status_code == 200
assert response.headers.get("content-type", "").startswith("application/json")
data = response.json()
assert isinstance(data, dict)
assert "token_ids" in data
assert isinstance(data["token_ids"], list)
assert len(data["token_ids"]) > 0
# /render should preserve stream fields on the returned token-in request.
assert data.get("stream") is True
assert isinstance(data.get("stream_options"), dict)
assert data["stream_options"].get("include_usage") is True
assert data["stream_options"].get("continuous_usage_stats") is True
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -224,3 +235,31 @@ async def test_completion_render_no_generation(client): ...@@ -224,3 +235,31 @@ async def test_completion_render_no_generation(client):
assert response.status_code == 200 assert response.status_code == 200
# Render should be fast (< 1 second) since no generation # Render should be fast (< 1 second) since no generation
assert elapsed < 1.0 assert elapsed < 1.0
@pytest.mark.asyncio
async def test_chat_completion_render_with_sampling_params(client):
    """Verify sampling params are correctly returned by /render."""
    # Sampling values sent with the request; each must come back unchanged.
    requested = {"temperature": 0.123, "top_p": 0.456, "frequency_penalty": 1.1}
    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": "Test sampling params"}],
        **requested,
    }

    response = await client.post("/v1/chat/completions/render", json=payload)
    assert response.status_code == 200

    body = response.json()
    assert "sampling_params" in body
    returned = body["sampling_params"]
    for name, expected in requested.items():
        assert returned.get(name) == expected

    # Check that internal fields are not present
    assert "_all_stop_token_ids" not in returned
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Multimodal tests for the /render endpoints that expose prompt preprocessing."""
import httpx
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_url
VISION_MODEL_NAME = "Qwen/Qwen3-VL-2B-Instruct"
@pytest.fixture(scope="module")
def vision_server():
    """Launch the vision model server shared by the multimodal /render tests.

    Yields:
        The running ``RemoteOpenAIServer`` for ``VISION_MODEL_NAME``; the
        surrounding ``with`` block exits when the fixture is finalized.
    """
    # Value-taking flags are kept as pairs so each option reads as one unit.
    server_args: list[str] = ["--enforce-eager"]
    for flag, value in (
        ("--max-model-len", "100"),
        ("--max-num-seqs", "1"),
        ("--limit-mm-per-prompt.image", "1"),
        ("--limit-mm-per-prompt.video", "0"),
    ):
        server_args += [flag, value]

    # No environment overrides are needed; an empty mapping is passed explicitly.
    no_env_overrides: dict[str, str] = {}
    server = RemoteOpenAIServer(
        VISION_MODEL_NAME,
        server_args,
        env_dict=no_env_overrides,
    )
    with server as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def vision_client(vision_server):
    """Yield an async HTTP client rooted at the vision server's base URL."""
    root_url = vision_server.url_for("")
    async with httpx.AsyncClient(base_url=root_url, timeout=60.0) as session:
        yield session
@pytest.mark.asyncio
async def test_chat_completion_render_with_base64_image_url(
    vision_client,
    local_asset_server,
):
    """Render an image+text chat request; check tokens and mm features."""
    encoded_image = encode_image_url(
        local_asset_server.get_image_asset("RGBA_comp.png"), format="PNG"
    )
    assert encoded_image.startswith("data:image/")
    assert ";base64," in encoded_image

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": encoded_image}},
            {"type": "text", "text": "What's in this image?"},
        ],
    }
    response = await vision_client.post(
        "/v1/chat/completions/render",
        json={"model": VISION_MODEL_NAME, "messages": [user_turn]},
    )
    assert response.status_code == 200

    rendered = response.json()
    assert isinstance(rendered, dict)
    assert "token_ids" in rendered
    assert isinstance(rendered["token_ids"], list)
    assert len(rendered["token_ids"]) > 0

    # The multimodal feature block must accompany the tokens.
    assert "features" in rendered
    mm_features = rendered["features"]
    assert mm_features is not None

    # mm_hashes["image"] is a non-empty list of hash strings.
    assert "mm_hashes" in mm_features
    assert "image" in mm_features["mm_hashes"]
    hashes = mm_features["mm_hashes"]["image"]
    assert isinstance(hashes, list)
    assert len(hashes) > 0
    for digest in hashes:
        assert isinstance(digest, str)

    # mm_placeholders["image"] is a non-empty list of offset/length dicts.
    assert "mm_placeholders" in mm_features
    assert "image" in mm_features["mm_placeholders"]
    placeholders = mm_features["mm_placeholders"]["image"]
    assert isinstance(placeholders, list)
    assert len(placeholders) > 0
    for placeholder in placeholders:
        for field in ("offset", "length"):
            assert field in placeholder
        for field in ("offset", "length"):
            assert isinstance(placeholder[field], int)
        assert placeholder["length"] > 0
@pytest.mark.asyncio
async def test_tokenize_matches_render_for_multimodal_input(
    vision_client,
    local_asset_server,
):
    """`/tokenize` and `/v1/chat/completions/render` must agree on tokens."""
    image_url = encode_image_url(
        local_asset_server.get_image_asset("RGBA_comp.png"), format="PNG"
    )
    # The very same request body is sent to both endpoints.
    request_body = {
        "model": VISION_MODEL_NAME,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "What's in this image?"},
                ],
            }
        ],
    }

    rendered = await vision_client.post(
        "/v1/chat/completions/render", json=request_body
    )
    assert rendered.status_code == 200
    rendered_body = rendered.json()

    tokenized = await vision_client.post("/tokenize", json=request_body)
    assert tokenized.status_code == 200
    tokenized_body = tokenized.json()

    assert tokenized_body["tokens"] == rendered_body["token_ids"]
    assert tokenized_body["count"] == len(rendered_body["token_ids"])
...@@ -2,31 +2,32 @@ ...@@ -2,31 +2,32 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem from openai_harmony import Message, Role
from openai.types.responses.response_output_item import McpCall
from openai_harmony import Author, Message, Role, TextContent
from tests.entrypoints.openai.utils import verify_harmony_messages from tests.entrypoints.openai.utils import verify_harmony_messages
from vllm.entrypoints.openai.parser.harmony_utils import ( from vllm.entrypoints.openai.parser.harmony_utils import (
auto_drop_analysis_messages, auto_drop_analysis_messages,
get_encoding, get_encoding,
get_system_message,
has_custom_tools, has_custom_tools,
parse_chat_input_to_harmony_message, parse_chat_input_to_harmony_message,
parse_chat_output, parse_chat_output,
parse_input_to_harmony_message, )
parse_output_message, from vllm.entrypoints.openai.responses.harmony import (
response_input_to_harmony,
response_previous_input_to_harmony,
) )
class TestCommonParseInputToHarmonyMessage: class TestCommonParseInputToHarmonyMessage:
""" """
Tests for scenarios that are common to both Chat Completion Tests for scenarios that are common to both Chat Completion
parse_chat_input_to_harmony_message and Responsees API parse_chat_input_to_harmony_message and Responses API
parse_input_to_harmony_message functions. response_previous_input_to_harmony functions.
""" """
@pytest.fixture( @pytest.fixture(
params=[parse_chat_input_to_harmony_message, parse_input_to_harmony_message] params=[parse_chat_input_to_harmony_message, response_previous_input_to_harmony]
) )
def parse_function(self, request): def parse_function(self, request):
return request.param return request.param
...@@ -211,81 +212,6 @@ class TestCommonParseInputToHarmonyMessage: ...@@ -211,81 +212,6 @@ class TestCommonParseInputToHarmonyMessage:
assert messages[0].content[1].text == "actual text" assert messages[0].content[1].text == "actual text"
class TestParseInputToHarmonyMessage:
    """
    Scenarios that only apply to the Responses API
    parse_input_to_harmony_message function.
    """

    def test_message_with_empty_content(self):
        """An empty-string user message yields one message with empty text."""
        parsed = parse_input_to_harmony_message({"role": "user", "content": ""})

        assert len(parsed) == 1
        assert parsed[0].content[0].text == ""

    def test_tool_message_with_string_content(self):
        """A string-content tool message maps to a functions.* tool author."""
        weather_report = "The weather in San Francisco is sunny, 72°F"
        parsed = parse_input_to_harmony_message(
            {"role": "tool", "name": "get_weather", "content": weather_report}
        )

        assert len(parsed) == 1
        only = parsed[0]
        assert only.author.role == Role.TOOL
        assert only.author.name == "functions.get_weather"
        assert only.content[0].text == weather_report
        assert only.channel == "commentary"

    def test_tool_message_with_array_content(self):
        """Text parts of array content are joined; other parts are dropped."""
        parts = [
            {"type": "text", "text": "Result 1: "},
            {"type": "text", "text": "Result 2: "},
            # Non-text part: expected to be ignored.
            {"type": "image", "url": "http://example.com/img.png"},
            {"type": "text", "text": "Result 3"},
        ]
        parsed = parse_input_to_harmony_message(
            {"role": "tool", "name": "search_results", "content": parts}
        )

        assert len(parsed) == 1
        only = parsed[0]
        assert only.author.role == Role.TOOL
        assert only.author.name == "functions.search_results"
        assert only.content[0].text == "Result 1: Result 2: Result 3"

    def test_tool_message_with_empty_content(self):
        """A tool message whose content is None yields empty text."""
        parsed = parse_input_to_harmony_message(
            {"role": "tool", "name": "empty_tool", "content": None}
        )

        assert len(parsed) == 1
        only = parsed[0]
        assert only.author.role == Role.TOOL
        assert only.author.name == "functions.empty_tool"
        assert only.content[0].text == ""
class TestParseChatInputToHarmonyMessage: class TestParseChatInputToHarmonyMessage:
""" """
Tests for scenarios that are specific to the Chat Completion API Tests for scenarios that are specific to the Chat Completion API
...@@ -840,192 +766,47 @@ class TestParseChatOutput: ...@@ -840,192 +766,47 @@ class TestParseChatOutput:
assert reasoning == "I've thought hard about this." assert reasoning == "I've thought hard about this."
assert final_content == "The answer is 4." assert final_content == "The answer is 4."
def test_parse_chat_output_commentary_with_recipient_excluded(self) -> None:
"""Commentary with a recipient (tool call) should not appear in
final_content — those are handled separately by the tool parser.
class TestParseOutputMessage: The first message is a preamble (visible), the second is a tool
"""Tests for parse_output_message function.""" call (excluded). Only the preamble should appear in final_content.
def test_commentary_with_no_recipient_creates_reasoning(self):
"""Test that commentary with recipient=None (preambles) creates reasoning items.
Per Harmony format, commentary channel can contain preambles to calling
multiple functions - explanatory text with no recipient.
""" """
message = Message.from_role_and_content( harmony_str = (
Role.ASSISTANT, "I will now search for the weather information." "<|channel|>commentary"
) "<|message|>Let me check the weather.<|end|>"
message = message.with_channel("commentary") "<|start|>assistant to=functions.get_weather"
# recipient is None by default, representing a preamble "<|channel|>commentary"
'<|message|>{"location": "SF"}<|end|>'
output_items = parse_output_message(message)
assert len(output_items) == 1
assert isinstance(output_items[0], ResponseReasoningItem)
assert output_items[0].type == "reasoning"
assert (
output_items[0].content[0].text
== "I will now search for the weather information."
)
assert output_items[0].content[0].type == "reasoning_text"
def test_commentary_with_function_recipient_creates_function_call(self):
"""Test commentary with recipient='functions.X' creates function calls."""
message = Message.from_role_and_content(
Role.ASSISTANT, '{"location": "San Francisco", "units": "celsius"}'
)
message = message.with_channel("commentary")
message = message.with_recipient("functions.get_weather")
output_items = parse_output_message(message)
assert len(output_items) == 1
assert isinstance(output_items[0], ResponseFunctionToolCall)
assert output_items[0].type == "function_call"
assert output_items[0].name == "get_weather"
assert (
output_items[0].arguments
== '{"location": "San Francisco", "units": "celsius"}'
)
assert output_items[0].call_id.startswith("call_")
assert output_items[0].id.startswith("fc_")
def test_commentary_with_python_recipient_creates_reasoning(self):
"""Test that commentary with recipient='python' creates reasoning items."""
message = Message.from_role_and_content(
Role.ASSISTANT, "import numpy as np\nprint(np.array([1, 2, 3]))"
)
message = message.with_channel("commentary")
message = message.with_recipient("python")
output_items = parse_output_message(message)
assert len(output_items) == 1
assert isinstance(output_items[0], ResponseReasoningItem)
assert output_items[0].type == "reasoning"
assert (
output_items[0].content[0].text
== "import numpy as np\nprint(np.array([1, 2, 3]))"
)
def test_commentary_with_browser_recipient_creates_reasoning(self):
"""Test that commentary with recipient='browser' creates reasoning items."""
message = Message.from_role_and_content(
Role.ASSISTANT, "Navigating to the specified URL"
)
message = message.with_channel("commentary")
message = message.with_recipient("browser")
output_items = parse_output_message(message)
assert len(output_items) == 1
assert isinstance(output_items[0], ResponseReasoningItem)
assert output_items[0].type == "reasoning"
assert output_items[0].content[0].text == "Navigating to the specified URL"
def test_commentary_with_container_recipient_creates_reasoning(self):
"""Test that commentary with recipient='container' creates reasoning items."""
message = Message.from_role_and_content(
Role.ASSISTANT, "Running command in container"
)
message = message.with_channel("commentary")
message = message.with_recipient("container")
output_items = parse_output_message(message)
assert len(output_items) == 1
assert isinstance(output_items[0], ResponseReasoningItem)
assert output_items[0].type == "reasoning"
assert output_items[0].content[0].text == "Running command in container"
def test_commentary_with_empty_content_and_no_recipient(self):
"""Test edge case: empty commentary with recipient=None."""
message = Message.from_role_and_content(Role.ASSISTANT, "")
message = message.with_channel("commentary")
output_items = parse_output_message(message)
assert len(output_items) == 1
assert isinstance(output_items[0], ResponseReasoningItem)
assert output_items[0].content[0].text == ""
def test_commentary_with_multiple_contents_and_no_recipient(self):
"""Test multiple content items in commentary with no recipient."""
contents = [
TextContent(text="Step 1: Analyze the request"),
TextContent(text="Step 2: Prepare to call functions"),
]
message = Message.from_role_and_contents(Role.ASSISTANT, contents)
message = message.with_channel("commentary")
output_items = parse_output_message(message)
assert len(output_items) == 2
assert all(isinstance(item, ResponseReasoningItem) for item in output_items)
assert output_items[0].content[0].text == "Step 1: Analyze the request"
assert output_items[1].content[0].text == "Step 2: Prepare to call functions"
def test_commentary_with_multiple_function_calls(self):
"""Test multiple function calls in commentary channel."""
contents = [
TextContent(text='{"location": "San Francisco"}'),
TextContent(text='{"location": "New York"}'),
]
message = Message.from_role_and_contents(Role.ASSISTANT, contents)
message = message.with_channel("commentary")
message = message.with_recipient("functions.get_weather")
output_items = parse_output_message(message)
assert len(output_items) == 2
assert all(isinstance(item, ResponseFunctionToolCall) for item in output_items)
assert output_items[0].name == "get_weather"
assert output_items[1].name == "get_weather"
assert output_items[0].arguments == '{"location": "San Francisco"}'
assert output_items[1].arguments == '{"location": "New York"}'
def test_commentary_with_unknown_recipient_creates_mcp_call(self):
"""Test that commentary with unknown recipient creates MCP call."""
message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}')
message = message.with_channel("commentary")
message = message.with_recipient("custom_tool")
output_items = parse_output_message(message)
assert len(output_items) == 1
assert isinstance(output_items[0], McpCall)
assert output_items[0].type == "mcp_call"
assert output_items[0].name == "custom_tool"
assert output_items[0].server_label == "custom_tool"
def test_analysis_channel_creates_reasoning(self):
"""Test that analysis channel creates reasoning items."""
message = Message.from_role_and_content(
Role.ASSISTANT, "Analyzing the problem step by step..."
)
message = message.with_channel("analysis")
output_items = parse_output_message(message)
assert len(output_items) == 1
assert isinstance(output_items[0], ResponseReasoningItem)
assert output_items[0].type == "reasoning"
assert (
output_items[0].content[0].text == "Analyzing the problem step by step..."
) )
token_ids = get_encoding().encode(harmony_str, allowed_special="all")
reasoning, final_content, _ = parse_chat_output(token_ids)
assert reasoning is None
assert final_content == "Let me check the weather."
def test_non_assistant_message_returns_empty(self): def test_parse_chat_output_interrupted_preamble(self) -> None:
"""Test that non-assistant messages return empty list. """Partial/interrupted preamble (commentary without recipient) should
appear in final_content, not reasoning."""
harmony_str = "<|channel|>commentary<|message|>I'll search for that"
token_ids = get_encoding().encode(harmony_str, allowed_special="all")
reasoning, final_content, _ = parse_chat_output(token_ids)
assert reasoning is None
assert final_content == "I'll search for that"
Per the implementation, tool messages to assistant (e.g., search results) def test_parse_chat_output_preamble_then_final(self) -> None:
are not included in final output to align with OpenAI behavior. """Preamble followed by a final message should both appear in
""" final_content, joined by newline."""
message = Message.from_author_and_content( harmony_str = (
Author.new(Role.TOOL, "functions.get_weather"), "<|channel|>commentary"
"The weather is sunny, 72°F", "<|message|>Let me look that up.<|end|>"
"<|start|>assistant<|channel|>final"
"<|message|>The answer is 42.<|end|>"
) )
token_ids = get_encoding().encode(harmony_str, allowed_special="all")
output_items = parse_output_message(message) reasoning, final_content, _ = parse_chat_output(token_ids)
assert reasoning is None
assert len(output_items) == 0 assert final_content == "Let me look that up.\nThe answer is 42."
def test_has_custom_tools() -> None: def test_has_custom_tools() -> None:
...@@ -1037,165 +818,113 @@ def test_has_custom_tools() -> None: ...@@ -1037,165 +818,113 @@ def test_has_custom_tools() -> None:
) )
def test_parse_mcp_call_basic() -> None: class TestGetSystemMessage:
"""Test that MCP calls are parsed with correct type and server_label.""" """Tests for get_system_message channel configuration."""
message = Message.from_role_and_content(Role.ASSISTANT, '{"path": "/tmp"}')
message = message.with_recipient("filesystem")
message = message.with_channel("commentary")
output_items = parse_output_message(message) def test_commentary_channel_present_without_custom_tools(self) -> None:
"""Commentary channel must be valid even without custom tools."""
sys_msg = get_system_message(with_custom_tools=False)
valid_channels = sys_msg.content[0].channel_config.valid_channels
assert "commentary" in valid_channels
assert len(output_items) == 1 def test_commentary_channel_present_with_custom_tools(self) -> None:
assert isinstance(output_items[0], McpCall) """Commentary channel present when custom tools are enabled."""
assert output_items[0].type == "mcp_call" sys_msg = get_system_message(with_custom_tools=True)
assert output_items[0].name == "filesystem" valid_channels = sys_msg.content[0].channel_config.valid_channels
assert output_items[0].server_label == "filesystem" assert "commentary" in valid_channels
assert output_items[0].arguments == '{"path": "/tmp"}'
assert output_items[0].status == "completed"
def test_all_standard_channels_present(self) -> None:
"""All three standard Harmony channels should always be valid."""
for with_tools in (True, False):
sys_msg = get_system_message(with_custom_tools=with_tools)
valid_channels = sys_msg.content[0].channel_config.valid_channels
for channel in ("analysis", "commentary", "final"):
assert channel in valid_channels, (
f"{channel} missing when with_custom_tools={with_tools}"
)
def test_parse_mcp_call_dotted_recipient() -> None:
"""Test that dotted recipients extract the tool name correctly."""
message = Message.from_role_and_content(Role.ASSISTANT, '{"cmd": "ls"}')
message = message.with_recipient("repo_browser.list")
message = message.with_channel("commentary")
output_items = parse_output_message(message) class TestResponseInputToHarmonyReasoningItem:
"""Tests for response_input_to_harmony handling of reasoning input items.
assert len(output_items) == 1 Per the OpenAI spec, ResponseReasoningItem.content is
assert isinstance(output_items[0], McpCall) Optional[List[Content]] = None. Clients like langchain-openai may omit
assert output_items[0].name == "list" this field when constructing multi-turn input from previous responses.
assert output_items[0].server_label == "repo_browser"
Reasoning items with content are converted to Harmony messages on the
'analysis' channel. All content items are concatenated. Items without
content return None (skipped by the caller).
"""
def test_mcp_vs_function_call() -> None: def test_reasoning_with_single_content(self):
"""Test that function calls are not parsed as MCP calls.""" """Test reasoning item with a single content entry."""
func_message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}') item = {
func_message = func_message.with_recipient("functions.my_tool") "type": "reasoning",
func_message = func_message.with_channel("commentary") "id": "rs_123",
"content": [{"type": "reasoning_text", "text": "Thinking step by step"}],
}
func_items = parse_output_message(func_message) msg = response_input_to_harmony(item, prev_responses=[])
assert len(func_items) == 1 assert msg is not None
assert not isinstance(func_items[0], McpCall) assert msg.author.role == Role.ASSISTANT
assert func_items[0].type == "function_call" assert msg.content[0].text == "Thinking step by step"
assert msg.channel == "analysis"
def test_reasoning_with_multiple_content_items(self):
"""Test reasoning item with multiple content entries concatenated."""
item = {
"type": "reasoning",
"id": "rs_123",
"content": [
{"type": "reasoning_text", "text": "First, let me analyze"},
{"type": "reasoning_text", "text": "Second, I should consider"},
{"type": "reasoning_text", "text": "Finally, the answer is"},
],
}
msg = response_input_to_harmony(item, prev_responses=[])
assert msg is not None
assert msg.author.role == Role.ASSISTANT
assert msg.content[0].text == (
"First, let me analyze\nSecond, I should consider\nFinally, the answer is"
)
assert msg.channel == "analysis"
def test_reasoning_without_content_returns_none(self):
"""Test reasoning item without content field returns None."""
item = {
"type": "reasoning",
"id": "rs_123",
"summary": [{"type": "summary_text", "text": "Thinking about math"}],
}
msg = response_input_to_harmony(item, prev_responses=[])
def test_mcp_vs_builtin_tools() -> None: assert msg is None
"""Test that built-in tools (python, container) are not parsed as MCP calls."""
# Test python (built-in tool) - should be reasoning, not MCP
python_message = Message.from_role_and_content(Role.ASSISTANT, "print('hello')")
python_message = python_message.with_recipient("python")
python_message = python_message.with_channel("commentary")
python_items = parse_output_message(python_message) def test_reasoning_with_none_content_returns_none(self):
"""Test reasoning item with content=None returns None."""
item = {
"type": "reasoning",
"id": "rs_123",
"content": None,
"summary": [{"type": "summary_text", "text": "Thinking about math"}],
}
assert len(python_items) == 1 msg = response_input_to_harmony(item, prev_responses=[])
assert not isinstance(python_items[0], McpCall)
assert python_items[0].type == "reasoning" assert msg is None
def test_reasoning_with_empty_content_returns_none(self):
"""Test reasoning item with empty content list returns None."""
item = {
"type": "reasoning",
"id": "rs_123",
"content": [],
}
msg = response_input_to_harmony(item, prev_responses=[])
def test_parse_remaining_state_commentary_channel() -> None: assert msg is None
"""Test parse_remaining_state with commentary channel and various recipients."""
from unittest.mock import Mock
from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state
# Test 1: functions.* recipient → should return function tool call
parser_func = Mock()
parser_func.current_content = '{"arg": "value"}'
parser_func.current_role = Role.ASSISTANT
parser_func.current_channel = "commentary"
parser_func.current_recipient = "functions.my_tool"
func_items = parse_remaining_state(parser_func)
assert len(func_items) == 1
assert not isinstance(func_items[0], McpCall)
assert func_items[0].type == "function_call"
assert func_items[0].name == "my_tool"
assert func_items[0].status == "in_progress"
# Test 2: MCP tool (not builtin) → should return MCP call
parser_mcp = Mock()
parser_mcp.current_content = '{"path": "/tmp"}'
parser_mcp.current_role = Role.ASSISTANT
parser_mcp.current_channel = "commentary"
parser_mcp.current_recipient = "filesystem"
mcp_items = parse_remaining_state(parser_mcp)
assert len(mcp_items) == 1
assert isinstance(mcp_items[0], McpCall)
assert mcp_items[0].type == "mcp_call"
assert mcp_items[0].name == "filesystem"
assert mcp_items[0].server_label == "filesystem"
assert mcp_items[0].status == "in_progress"
# Test 3: Built-in tool (python)
# should NOT return MCP call, falls through to reasoning
parser_builtin = Mock()
parser_builtin.current_content = "print('hello')"
parser_builtin.current_role = Role.ASSISTANT
parser_builtin.current_channel = "commentary"
parser_builtin.current_recipient = "python"
builtin_items = parse_remaining_state(parser_builtin)
# Should fall through to reasoning logic
assert len(builtin_items) == 1
assert not isinstance(builtin_items[0], McpCall)
assert builtin_items[0].type == "reasoning"
def test_parse_remaining_state_analysis_channel() -> None:
    """Check parse_remaining_state on the analysis channel per recipient kind."""
    from unittest.mock import Mock

    from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state

    def analysis_parser(content: str, recipient: str) -> Mock:
        # Stand-in parser exposing only the attributes set by this test.
        return Mock(
            current_content=content,
            current_role=Role.ASSISTANT,
            current_channel="analysis",
            current_recipient=recipient,
        )

    # Case 1: functions.* recipient -> function tool call, not an MCP call.
    function_items = parse_remaining_state(
        analysis_parser('{"arg": "value"}', "functions.my_tool")
    )
    assert len(function_items) == 1
    assert not isinstance(function_items[0], McpCall)
    assert function_items[0].type == "function_call"
    assert function_items[0].name == "my_tool"
    assert function_items[0].status == "in_progress"

    # Case 2: non-builtin tool recipient -> MCP call.
    mcp_items = parse_remaining_state(
        analysis_parser('{"query": "test"}', "database")
    )
    assert len(mcp_items) == 1
    assert isinstance(mcp_items[0], McpCall)
    assert mcp_items[0].type == "mcp_call"
    assert mcp_items[0].name == "database"
    assert mcp_items[0].server_label == "database"
    assert mcp_items[0].status == "in_progress"

    # Case 3: built-in tool (container) -> not an MCP call; a reasoning item.
    builtin_items = parse_remaining_state(
        analysis_parser("docker run", "container")
    )
    assert len(builtin_items) == 1
    assert not isinstance(builtin_items[0], McpCall)
    assert builtin_items[0].type == "reasoning"
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import json
import logging
from collections.abc import Callable
from typing import Any
import pytest import pytest
logger = logging.getLogger(__name__)
BASE_TEST_ENV = {
# The day vLLM said "hello world" on arxiv 🚀
"VLLM_SYSTEM_START_DATE": "2023-09-12",
}
DEFAULT_MAX_RETRIES = 3
@pytest.fixture @pytest.fixture
def pairs_of_event_types() -> dict[str, str]: def pairs_of_event_types() -> dict[str, str]:
...@@ -24,7 +39,325 @@ def pairs_of_event_types() -> dict[str, str]: ...@@ -24,7 +39,325 @@ def pairs_of_event_types() -> dict[str, str]:
"response.mcp_call.completed": "response.mcp_call.in_progress", "response.mcp_call.completed": "response.mcp_call.in_progress",
"response.function_call_arguments.done": "response.function_call_arguments.delta", # noqa: E501 "response.function_call_arguments.done": "response.function_call_arguments.delta", # noqa: E501
"response.code_interpreter_call_code.done": "response.code_interpreter_call_code.delta", # noqa: E501 "response.code_interpreter_call_code.done": "response.code_interpreter_call_code.delta", # noqa: E501
"response.code_interpreter_call.completed": "response.code_interpreter_call.in_progress", # noqa: E501
"response.web_search_call.completed": "response.web_search_call.in_progress", "response.web_search_call.completed": "response.web_search_call.in_progress",
} }
# fmt: on # fmt: on
return event_pairs return event_pairs
async def retry_for_tool_call(
    client,
    *,
    model: str,
    expected_tool_type: str,
    max_retries: int = DEFAULT_MAX_RETRIES,
    **create_kwargs: Any,
):
    """Call ``client.responses.create`` up to *max_retries* times, returning
    the first response that contains an output item of *expected_tool_type*.

    Returns the **last** response if none match so the caller's assertions
    fire with a clear diagnostic.

    Args:
        client: Object exposing an awaitable ``responses.create``.
        model: Forwarded to ``responses.create`` as ``model``.
        expected_tool_type: Value compared against each output item's
            ``type`` attribute.
        max_retries: Maximum number of calls; must be at least 1.
        **create_kwargs: Forwarded unchanged to ``responses.create``.

    Raises:
        ValueError: If *max_retries* is less than 1, since no request would
            be made and there would be no response to return.
    """
    # Validate explicitly: a bare ``assert`` is stripped under ``-O`` and
    # would let ``None`` escape to the caller.
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    response = None
    for _ in range(max_retries):
        response = await client.responses.create(model=model, **create_kwargs)
        if any(
            getattr(item, "type", None) == expected_tool_type
            for item in response.output
        ):
            break
    # Either the first matching response or, failing that, the last one.
    return response
async def retry_streaming_for(
    client,
    *,
    model: str,
    validate_events: Callable[[list], bool],
    max_retries: int = DEFAULT_MAX_RETRIES,
    **create_kwargs: Any,
) -> list:
    """Stream ``client.responses.create(stream=True)`` up to *max_retries*
    times and return the first collected event list that *validate_events*
    accepts.

    If no attempt is accepted, the events of the final attempt are returned;
    if no attempt is made at all, the result is an empty list.
    """
    collected: list = []
    for _ in range(max_retries):
        stream = await client.responses.create(
            model=model, stream=True, **create_kwargs
        )
        # Drain the whole stream before judging the attempt.
        collected = [event async for event in stream]
        if validate_events(collected):
            break
    return collected
def has_output_type(response, type_name: str) -> bool:
    """Tell whether any item in ``response.output`` has ``type == type_name``.

    Items without a ``type`` attribute never match.
    """
    for output_item in response.output:
        if getattr(output_item, "type", None) == type_name:
            return True
    return False
def events_contain_type(events: list, type_substring: str) -> bool:
    """Return True if any event's type contains *type_substring*.

    Events whose ``type`` attribute is missing or is not a string (for
    example ``None``) are skipped rather than raising ``TypeError`` on the
    substring check.
    """
    for event in events:
        event_type = getattr(event, "type", None)
        # A getattr default only covers a *missing* attribute; an attribute
        # that is present but None must be filtered out explicitly.
        if isinstance(event_type, str) and type_substring in event_type:
            return True
    return False
def _validate_event_pairing(events: list, pairs_of_event_types: dict[str, str]) -> None:
    """Assert that streaming events open and close in properly nested pairs.

    *pairs_of_event_types* maps each closing event type to the opening type
    it must match. Opening types are pushed on a stack, closing types must
    match the top of the stack, and every other event type is ignored.
    """
    closers = set(pairs_of_event_types.keys())
    openers = set(pairs_of_event_types.values())
    open_stack: list[str] = []

    for event in events:
        kind = event.type

        if kind in closers:
            wanted = pairs_of_event_types[kind]
            assert open_stack and open_stack[-1] == wanted, (
                f"Stack mismatch for {kind}: "
                f"expected {wanted}, "
                f"got {open_stack[-1] if open_stack else '<empty>'}"
            )
            open_stack.pop()
            continue

        if kind not in openers:
            # Passthrough event (e.g. response.in_progress,
            # web_search_call.searching, code_interpreter_call.interpreting)
            continue

        # A run of identical delta events occupies a single stack slot.
        repeated_delta = (
            kind.endswith("delta") and bool(open_stack) and open_stack[-1] == kind
        )
        if not repeated_delta:
            open_stack.append(kind)

    assert len(open_stack) == 0, f"Unclosed events on stack: {open_stack}"
def _validate_event_ordering(events: list) -> None:
    """Assert that the envelope events sit at their required positions.

    The stream must hold at least two events, start with
    ``response.created``, end with ``response.completed``, contain each of
    those exactly once, and carry ``response.in_progress`` (when present)
    only as the second event.
    """
    total = len(events)
    assert total >= 2, f"Expected at least 2 events, got {total}"

    # Envelope start.
    opening = events[0]
    assert opening.type == "response.created", (
        f"First event must be response.created, got {opening.type}"
    )

    # Envelope end.
    closing = events[-1]
    assert closing.type == "response.completed", (
        f"Last event must be response.completed, got {closing.type}"
    )

    seen_types = [event.type for event in events]

    # response.in_progress is optional, but only allowed at index 1.
    in_progress_positions = [
        position
        for position, seen in enumerate(seen_types)
        if seen == "response.in_progress"
    ]
    if in_progress_positions:
        assert in_progress_positions == [1], (
            f"response.in_progress must be the second event, "
            f"found at indices {in_progress_positions}"
        )

    # Each envelope marker appears exactly once.
    created_total = seen_types.count("response.created")
    completed_total = seen_types.count("response.completed")
    assert created_total == 1, (
        f"Expected exactly 1 response.created, got {created_total}"
    )
    assert completed_total == 1, (
        f"Expected exactly 1 response.completed, got {completed_total}"
    )
def _validate_field_consistency(events: list) -> None:
    """Check that item-scoped identifiers stay stable within each output item.

    ``response.output_item.added`` opens an item and records its ``item.id``
    and ``output_index``; ``response.output_item.done`` closes it.  Every
    other non-envelope event is compared against the recorded item via
    ``_assert_item_fields``.  ``content_index`` is recorded from
    ``content_part.added`` / ``reasoning_part.added`` and compared on later
    events that carry one.  Fields that are missing (``None``) on either
    side are skipped rather than treated as mismatches.

    Args:
        events: Streamed event objects; each must expose a ``type``
            attribute.

    Raises:
        AssertionError: If an identifier disagrees with the open item, or
            if ``output_index`` decreases between successive items.
    """
    envelope_types = {
        "response.created",
        "response.in_progress",
        "response.completed",
    }
    part_opening_types = (
        "response.content_part.added",
        "response.reasoning_part.added",
    )

    # State describing the item that is currently open (if any).
    current_item_id: str | None = None
    current_output_index: int | None = None
    current_content_index: int | None = None
    # Highest output_index seen on any output_item.added so far.
    highest_output_index: int = -1

    for evt in events:
        kind = evt.type

        if kind in envelope_types:
            # Envelope events carry no item-level fields.
            continue

        if kind == "response.output_item.added":
            # Opens a new item and resets the per-item content tracking.
            opened_item = getattr(evt, "item", None)
            opened_index = getattr(evt, "output_index", None)
            assert opened_item is not None, "output_item.added must have an item"
            opened_id = getattr(opened_item, "id", None)
            assert opened_id, "output_item.added item must have an id"
            # output_index must be non-decreasing across items.
            if opened_index is not None:
                assert opened_index >= highest_output_index, (
                    f"output_index went backwards: "
                    f"{opened_index} < {highest_output_index}"
                )
                highest_output_index = opened_index
            current_item_id, current_output_index = opened_id, opened_index
            current_content_index = None

        elif kind == "response.output_item.done":
            # Closes the open item; identifiers must match what opened it.
            closed_item = getattr(evt, "item", None)
            closed_index = getattr(evt, "output_index", None)
            assert closed_item is not None, "output_item.done must have an item"
            closed_id = getattr(closed_item, "id", None)
            if current_item_id is not None and closed_id:
                assert closed_id == current_item_id, (
                    f"output_item.done item.id mismatch: "
                    f"expected {current_item_id}, got {closed_id}"
                )
            if current_output_index is not None and closed_index is not None:
                assert closed_index == current_output_index, (
                    f"output_item.done output_index mismatch: "
                    f"expected {current_output_index}, got {closed_index}"
                )
            current_item_id = current_output_index = current_content_index = None

        elif kind in part_opening_types:
            # A new part establishes the content_index for what follows.
            _assert_item_fields(evt, kind, current_item_id, current_output_index)
            current_content_index = getattr(evt, "content_index", None)

        else:
            # Any other item-level event (deltas, part done, ...).
            _assert_item_fields(evt, kind, current_item_id, current_output_index)
            seen_content_index = getattr(evt, "content_index", None)
            # content_index is only compared when both sides carry one.
            if seen_content_index is None or current_content_index is None:
                continue
            assert seen_content_index == current_content_index, (
                f"{kind} content_index mismatch: "
                f"expected {current_content_index}, got {seen_content_index}"
            )
def _assert_item_fields(
event,
etype: str,
active_item_id: str | None,
active_output_index: int | None,
) -> None:
"""Check that *event*'s item_id and output_index match the active item."""
event_item_id = getattr(event, "item_id", None)
output_index = getattr(event, "output_index", None)
if active_item_id is not None and event_item_id is not None:
assert event_item_id == active_item_id, (
f"{etype} item_id mismatch: expected {active_item_id}, got {event_item_id}"
)
if active_output_index is not None and output_index is not None:
assert output_index == active_output_index, (
f"{etype} output_index mismatch: "
f"expected {active_output_index}, got {output_index}"
)
def validate_streaming_event_stack(
    events: list, pairs_of_event_types: dict[str, str]
) -> None:
    """Run every streaming-event validator over *events*.

    The checks run in this order, and the first failing one raises:

    1. **Pairing** — ``_validate_event_pairing`` matches start/end events
       using *pairs_of_event_types*.
    2. **Ordering** — ``_validate_event_ordering`` checks where the
       ``created`` / ``in_progress`` / ``completed`` envelope events sit.
    3. **Field consistency** — ``_validate_field_consistency`` checks that
       ``item_id``, ``output_index`` and ``content_index`` agree across
       the events of each output item.

    Args:
        events: Streamed event objects in the order they were received.
        pairs_of_event_types: Mapping handed to the pairing check; the
            keys are looked up as end-event types and the values as the
            matching start-event types.

    Raises:
        AssertionError: Propagated from whichever validator fails first.
    """
    _validate_event_pairing(events, pairs_of_event_types)
    # The remaining validators only need the event list itself.
    for run_check in (_validate_event_ordering, _validate_field_consistency):
        run_check(events)
def log_response_diagnostics(
    response,
    *,
    label: str = "Response Diagnostics",
) -> dict[str, Any]:
    """Extract and log diagnostic info from a Responses API response.

    Logs reasoning, tool-call attempts, MCP items, and output types so
    that CI output (``pytest -s`` or ``--log-cli-level=INFO``) gives
    full visibility into model behaviour even on passing runs.

    Args:
        response: Response object exposing ``output`` (iterable of output
            items) and ``output_text``.  ``output_messages`` is read when
            present; a missing or ``None`` value is treated as empty.
        label: Heading used for the logged diagnostics block.

    Returns:
        The extracted data, with keys ``model_attempted_tool_calls``,
        ``tool_call_attempts``, ``mcp_items``, ``reasoning``,
        ``output_text`` and ``output_types``, so callers can make
        additional assertions if needed.
    """
    output_items = response.output

    reasoning_texts = [
        text
        for item in output_items
        if getattr(item, "type", None) == "reasoning"
        # ``content`` can be present but set to ``None``; a plain getattr
        # default would not cover that case, hence the ``or ()``.
        for content in getattr(item, "content", None) or ()
        if (text := getattr(content, "text", None))
    ]

    # NOTE(review): output_messages appears to be populated only when the
    # server is asked to return raw messages -- confirm.  A diagnostics
    # helper should not crash when it is absent, so fall back to empty.
    raw_messages = getattr(response, "output_messages", None) or ()
    tool_call_attempts = [
        {
            "recipient": msg.get("recipient"),
            "channel": msg.get("channel"),
        }
        for msg in raw_messages
        if (msg.get("recipient") or "").startswith("python")
    ]

    mcp_items = [
        {
            "name": getattr(item, "name", None),
            "status": getattr(item, "status", None),
        }
        for item in output_items
        if getattr(item, "type", None) == "mcp_call"
    ]

    output_types = [getattr(item, "type", None) for item in output_items]

    diagnostics = {
        "model_attempted_tool_calls": bool(tool_call_attempts),
        "tool_call_attempts": tool_call_attempts,
        "mcp_items": mcp_items,
        "reasoning": reasoning_texts,
        "output_text": response.output_text,
        "output_types": output_types,
    }

    # default=str keeps the dump from failing on non-JSON-native values.
    logger.info(
        "\n====== %s ======\n%s\n==============================",
        label,
        json.dumps(diagnostics, indent=2, default=str),
    )

    return diagnostics
...@@ -6,7 +6,6 @@ from unittest.mock import MagicMock ...@@ -6,7 +6,6 @@ from unittest.mock import MagicMock
import pytest import pytest
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.engine.serving import GenerationError, OpenAIServing from vllm.entrypoints.openai.engine.serving import GenerationError, OpenAIServing
...@@ -38,32 +37,6 @@ async def test_raise_if_error_raises_generation_error(): ...@@ -38,32 +37,6 @@ async def test_raise_if_error_raises_generation_error():
serving._raise_if_error(None, "test-request-id") # should not raise serving._raise_if_error(None, "test-request-id") # should not raise
@pytest.mark.asyncio
async def test_convert_generation_error_to_response():
    """A GenerationError is converted into an internal-server ErrorResponse."""
    # Minimal engine stub: only model_config.max_model_len is given a value.
    engine = MagicMock()
    engine.model_config = MagicMock()
    engine.model_config.max_model_len = 100

    serving = OpenAIServing(
        engine_client=engine,
        models=MagicMock(),
        request_logger=None,
    )

    # Build the error and run it through the conversion under test.
    error_response = serving._convert_generation_error_to_response(
        GenerationError("Internal server error")
    )

    assert isinstance(error_response, ErrorResponse)
    error = error_response.error
    assert error.type == "InternalServerError"
    assert error.message == "Internal server error"
    assert error.code == HTTPStatus.INTERNAL_SERVER_ERROR
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_convert_generation_error_to_streaming_response(): async def test_convert_generation_error_to_streaming_response():
"""test _convert_generation_error_to_streaming_response output""" """test _convert_generation_error_to_streaming_response output"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment