Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3fb4b5fa
Commit
3fb4b5fa
authored
Mar 23, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.0' into v0.18.0-ori
parents
bcf25339
89138b21
Changes
488
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
878 additions
and
548 deletions
+878
-548
tests/entrypoints/openai/chat_completion/test_serving_chat.py
...s/entrypoints/openai/chat_completion/test_serving_chat.py
+152
-45
tests/entrypoints/openai/chat_completion/test_serving_chat_stream_harmony.py
...penai/chat_completion/test_serving_chat_stream_harmony.py
+4
-10
tests/entrypoints/openai/cpu/__init__.py
tests/entrypoints/openai/cpu/__init__.py
+0
-0
tests/entrypoints/openai/cpu/test_render.py
tests/entrypoints/openai/cpu/test_render.py
+96
-57
tests/entrypoints/openai/cpu/test_render_multimodal.py
tests/entrypoints/openai/cpu/test_render_multimodal.py
+155
-0
tests/entrypoints/openai/parser/test_harmony_utils.py
tests/entrypoints/openai/parser/test_harmony_utils.py
+138
-409
tests/entrypoints/openai/responses/conftest.py
tests/entrypoints/openai/responses/conftest.py
+333
-0
tests/entrypoints/openai/responses/test_errors.py
tests/entrypoints/openai/responses/test_errors.py
+0
-27
No files found.
Too many changes to show.
To preserve performance only
488 of 488+
files are displayed.
Plain diff
Email patch
tests/entrypoints/openai/test_serving_chat.py
→
tests/entrypoints/openai/
chat_completion/
test_serving_chat.py
View file @
3fb4b5fa
...
@@ -10,6 +10,12 @@ import pytest
...
@@ -10,6 +10,12 @@ import pytest
import
pytest_asyncio
import
pytest_asyncio
from
openai
import
OpenAI
from
openai
import
OpenAI
from
tests.entrypoints.openai.utils
import
(
accumulate_streaming_response
,
verify_chat_response
,
verify_harmony_messages
,
)
from
tests.utils
import
RemoteOpenAIServer
from
vllm._aiter_ops
import
is_aiter_found_and_supported
from
vllm._aiter_ops
import
is_aiter_found_and_supported
from
vllm.config
import
MultiModalConfig
from
vllm.config
import
MultiModalConfig
from
vllm.entrypoints.openai.chat_completion.protocol
import
(
from
vllm.entrypoints.openai.chat_completion.protocol
import
(
...
@@ -21,8 +27,14 @@ from vllm.entrypoints.openai.engine.protocol import (
...
@@ -21,8 +27,14 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse
,
ErrorResponse
,
RequestResponseMetadata
,
RequestResponseMetadata
,
)
)
from
vllm.entrypoints.openai.models.serving
import
BaseModelPath
,
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
(
BaseModelPath
,
OpenAIModelRegistry
,
OpenAIServingModels
,
)
from
vllm.entrypoints.openai.parser.harmony_utils
import
get_encoding
from
vllm.entrypoints.openai.parser.harmony_utils
import
get_encoding
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs
import
TokensPrompt
from
vllm.inputs
import
TokensPrompt
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.renderers.hf
import
HfRenderer
from
vllm.renderers.hf
import
HfRenderer
...
@@ -33,13 +45,6 @@ from vllm.tokenizers.registry import tokenizer_args_from_config
...
@@ -33,13 +45,6 @@ from vllm.tokenizers.registry import tokenizer_args_from_config
from
vllm.tool_parsers
import
ToolParserManager
from
vllm.tool_parsers
import
ToolParserManager
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
...utils
import
RemoteOpenAIServer
from
.utils
import
(
accumulate_streaming_response
,
verify_chat_response
,
verify_harmony_messages
,
)
GPT_OSS_MODEL_NAME
=
"openai/gpt-oss-20b"
GPT_OSS_MODEL_NAME
=
"openai/gpt-oss-20b"
GPT_OSS_SPECULATOR_NAME
=
"RedHatAI/gpt-oss-20b-speculator.eagle3"
GPT_OSS_SPECULATOR_NAME
=
"RedHatAI/gpt-oss-20b-speculator.eagle3"
...
@@ -126,7 +131,7 @@ def gptoss_speculative_server(default_server_args: list[str]):
...
@@ -126,7 +131,7 @@ def gptoss_speculative_server(default_server_args: list[str]):
if
is_aiter_found_and_supported
():
if
is_aiter_found_and_supported
():
env_dict
=
{
"VLLM_ROCM_USE_AITER"
:
"1"
}
env_dict
=
{
"VLLM_ROCM_USE_AITER"
:
"1"
}
with
RemoteOpenAIServer
(
with
RemoteOpenAIServer
(
GPT_OSS_MODEL_NAME
,
server_args
,
env_dict
=
env_dict
GPT_OSS_MODEL_NAME
,
server_args
,
env_dict
=
env_dict
,
max_wait_seconds
=
480
)
as
remote_server
:
)
as
remote_server
:
yield
remote_server
yield
remote_server
...
@@ -520,38 +525,67 @@ class MockModelConfig:
...
@@ -520,38 +525,67 @@ class MockModelConfig:
multimodal_config
=
MultiModalConfig
()
multimodal_config
=
MultiModalConfig
()
hf_config
=
MockHFConfig
()
hf_config
=
MockHFConfig
()
logits_processors
:
list
[
str
]
|
None
=
None
logits_processors
:
list
[
str
]
|
None
=
None
logits_processor_pattern
=
None
diff_sampling_param
:
dict
|
None
=
None
diff_sampling_param
:
dict
|
None
=
None
allowed_local_media_path
:
str
=
""
allowed_local_media_path
:
str
=
""
allowed_media_domains
:
list
[
str
]
|
None
=
None
allowed_media_domains
:
list
[
str
]
|
None
=
None
encoder_config
=
None
encoder_config
=
None
generation_config
:
str
=
"auto"
generation_config
:
str
=
"auto"
override_generation_config
:
dict
[
str
,
Any
]
=
field
(
default_factory
=
dict
)
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
field
(
default_factory
=
dict
)
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
field
(
default_factory
=
dict
)
skip_tokenizer_init
:
bool
=
False
skip_tokenizer_init
:
bool
=
False
is_encoder_decoder
:
bool
=
False
is_encoder_decoder
:
bool
=
False
is_multimodal_model
:
bool
=
False
def
get_diff_sampling_param
(
self
):
def
get_diff_sampling_param
(
self
):
return
self
.
diff_sampling_param
or
{}
return
self
.
diff_sampling_param
or
{}
@
dataclass
class
MockParallelConfig
:
_api_process_rank
:
int
=
0
@
dataclass
class
MockVllmConfig
:
model_config
:
MockModelConfig
parallel_config
:
MockParallelConfig
def
_build_renderer
(
model_config
:
MockModelConfig
):
def
_build_renderer
(
model_config
:
MockModelConfig
):
_
,
tokenizer_name
,
_
,
kwargs
=
tokenizer_args_from_config
(
model_config
)
_
,
tokenizer_name
,
_
,
kwargs
=
tokenizer_args_from_config
(
model_config
)
return
HfRenderer
(
return
HfRenderer
.
from_config
(
model_config
,
MockVllmConfig
(
model_config
,
parallel_config
=
MockParallelConfig
())
,
tokenizer_kwargs
=
{
**
kwargs
,
"tokenizer_name"
:
tokenizer_name
},
tokenizer_kwargs
=
{
**
kwargs
,
"tokenizer_name"
:
tokenizer_name
},
)
)
def
_build_serving_render
(
engine
,
model_registry
:
OpenAIModelRegistry
)
->
OpenAIServingRender
:
return
OpenAIServingRender
(
model_config
=
engine
.
model_config
,
renderer
=
engine
.
renderer
,
io_processor
=
engine
.
io_processor
,
model_registry
=
model_registry
,
request_logger
=
None
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
)
def
_build_serving_chat
(
engine
:
AsyncLLM
)
->
OpenAIServingChat
:
def
_build_serving_chat
(
engine
:
AsyncLLM
)
->
OpenAIServingChat
:
models
=
OpenAIServingModels
(
models
=
OpenAIServingModels
(
engine_client
=
engine
,
engine_client
=
engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
)
openai_serving_render
=
_build_serving_render
(
engine
,
models
.
registry
)
serving_chat
=
OpenAIServingChat
(
serving_chat
=
OpenAIServingChat
(
engine
,
engine
,
models
,
models
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
request_logger
=
None
,
...
@@ -572,10 +606,13 @@ async def _async_serving_chat_init():
...
@@ -572,10 +606,13 @@ async def _async_serving_chat_init():
engine
=
MockEngine
()
engine
=
MockEngine
()
models
=
OpenAIServingModels
(
engine
,
BASE_MODEL_PATHS
)
models
=
OpenAIServingModels
(
engine
,
BASE_MODEL_PATHS
)
openai_serving_render
=
_build_serving_render
(
engine
,
models
.
registry
)
serving_completion
=
OpenAIServingChat
(
serving_completion
=
OpenAIServingChat
(
engine
,
engine
,
models
,
models
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
request_logger
=
None
,
...
@@ -645,12 +682,10 @@ async def test_serving_chat_should_set_correct_max_tokens():
...
@@ -645,12 +682,10 @@ async def test_serving_chat_should_set_correct_max_tokens():
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
10
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
10
#
Setting server's max_tokens in the generation_config.json
#
Model author's generation_config.json sets max_tokens (auto, no override)
#
lower than context_window - prompt_tokens
#
— should act as fallback only, not ceiling
mock_model_config
=
MockModelConfig
()
mock_model_config
=
MockModelConfig
()
mock_model_config
.
diff_sampling_param
=
{
mock_model_config
.
diff_sampling_param
=
{
"max_tokens"
:
10
}
"max_tokens"
:
10
# Setting server-side max_tokens limit
}
# Reinitialize the engine with new settings
# Reinitialize the engine with new settings
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
...
@@ -674,13 +709,14 @@ async def test_serving_chat_should_set_correct_max_tokens():
...
@@ -674,13 +709,14 @@ async def test_serving_chat_should_set_correct_max_tokens():
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
10
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
10
# Test Case 2: Request's max_tokens set higher than server accepts
# Test Case 2: Request's max_tokens set higher than generation_config
# default so request-provided max_tokens takes precedence
req
.
max_tokens
=
15
req
.
max_tokens
=
15
with
suppress
(
Exception
):
with
suppress
(
Exception
):
await
serving_chat
.
create_chat_completion
(
req
)
await
serving_chat
.
create_chat_completion
(
req
)
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
1
0
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
1
5
# Test Case 3: Request's max_tokens set lower than server accepts
# Test Case 3: Request's max_tokens set lower than server accepts
req
.
max_tokens
=
5
req
.
max_tokens
=
5
...
@@ -690,12 +726,52 @@ async def test_serving_chat_should_set_correct_max_tokens():
...
@@ -690,12 +726,52 @@ async def test_serving_chat_should_set_correct_max_tokens():
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
5
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
5
# User explicitly sets max_tokens via --override-generation-config
# — should act as a ceiling
mock_model_config
=
MockModelConfig
()
mock_model_config
.
diff_sampling_param
=
{
"max_tokens"
:
10
}
mock_model_config
.
override_generation_config
=
{
"max_new_tokens"
:
10
}
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
mock_engine
.
errored
=
False
mock_engine
.
model_config
=
mock_model_config
mock_engine
.
input_processor
=
MagicMock
()
mock_engine
.
io_processor
=
MagicMock
()
mock_engine
.
renderer
=
_build_renderer
(
mock_engine
.
model_config
)
serving_chat
=
_build_serving_chat
(
mock_engine
)
# Test Case 3.1: No max_tokens — uses override as default
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"user"
,
"content"
:
"what is 1+1?"
}],
)
with
suppress
(
Exception
):
await
serving_chat
.
create_chat_completion
(
req
)
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
10
# Test Case 3.2: Request max_tokens higher — capped by user ceiling from override
req
.
max_tokens
=
15
with
suppress
(
Exception
):
await
serving_chat
.
create_chat_completion
(
req
)
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
10
# Test Case 3.3: Request max_tokens lower — respected
req
.
max_tokens
=
5
with
suppress
(
Exception
):
await
serving_chat
.
create_chat_completion
(
req
)
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
5
# Setting server's max_tokens in the generation_config.json
# Setting server's max_tokens in the generation_config.json
# higher than context_window - prompt_tokens
# higher than context_window - prompt_tokens
mock_model_config
=
MockModelConfig
()
mock_model_config
=
MockModelConfig
()
mock_model_config
.
diff_sampling_param
=
{
mock_model_config
.
diff_sampling_param
=
{
"max_tokens"
:
200
}
"max_tokens"
:
200
# Setting server-side max_tokens limit
}
# Reinitialize the engine with new settings
# Reinitialize the engine with new settings
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
...
@@ -749,8 +825,10 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
...
@@ -749,8 +825,10 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
mock_engine
.
io_processor
=
MagicMock
()
mock_engine
.
io_processor
=
MagicMock
()
mock_tokenizer
=
MagicMock
(
spec
=
MistralTokenizer
)
mock_tokenizer
=
MagicMock
(
spec
=
MistralTokenizer
)
mock_renderer
=
MistralRenderer
(
mock_engine
.
model_config
,
tokenizer_kwargs
=
{})
mock_renderer
=
MistralRenderer
(
mock_renderer
.
_tokenizer
=
mock_tokenizer
MockVllmConfig
(
mock_engine
.
model_config
,
parallel_config
=
MockParallelConfig
()),
tokenizer
=
mock_tokenizer
,
)
# Force the Mistral chat template renderer to return token IDs.
# Force the Mistral chat template renderer to return token IDs.
# Choose a prompt length that is < max_model_len, but large enough that
# Choose a prompt length that is < max_model_len, but large enough that
# adding max_tokens should exceed the model context window.
# adding max_tokens should exceed the model context window.
...
@@ -770,9 +848,8 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
...
@@ -770,9 +848,8 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
max_tokens
=
10
,
max_tokens
=
10
,
)
)
resp
=
await
serving_chat
.
create_chat_completion
(
req
)
with
pytest
.
raises
(
VLLMValidationError
):
assert
isinstance
(
resp
,
ErrorResponse
)
await
serving_chat
.
create_chat_completion
(
req
)
assert
"context length is only"
in
resp
.
error
.
message
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -788,8 +865,10 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
...
@@ -788,8 +865,10 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
mock_engine
.
io_processor
=
MagicMock
()
mock_engine
.
io_processor
=
MagicMock
()
mock_tokenizer
=
MagicMock
(
spec
=
MistralTokenizer
)
mock_tokenizer
=
MagicMock
(
spec
=
MistralTokenizer
)
mock_renderer
=
MistralRenderer
(
mock_engine
.
model_config
,
tokenizer_kwargs
=
{})
mock_renderer
=
MistralRenderer
(
mock_renderer
.
_tokenizer
=
mock_tokenizer
MockVllmConfig
(
mock_engine
.
model_config
,
parallel_config
=
MockParallelConfig
()),
tokenizer
=
mock_tokenizer
,
)
# prompt_token_ids length == max_model_len should be rejected for
# prompt_token_ids length == max_model_len should be rejected for
# completion-like requests (ChatCompletionRequest).
# completion-like requests (ChatCompletionRequest).
mock_renderer
.
render_messages_async
=
AsyncMock
(
mock_renderer
.
render_messages_async
=
AsyncMock
(
...
@@ -810,9 +889,8 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
...
@@ -810,9 +889,8 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
max_tokens
=
1
,
max_tokens
=
1
,
)
)
resp
=
await
serving_chat
.
create_chat_completion
(
req
)
with
pytest
.
raises
(
VLLMValidationError
):
assert
isinstance
(
resp
,
ErrorResponse
)
await
serving_chat
.
create_chat_completion
(
req
)
assert
"context length is only"
in
resp
.
error
.
message
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -1127,7 +1205,9 @@ class TestServingChatWithHarmony:
...
@@ -1127,7 +1205,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
[
[
...
@@ -1154,7 +1234,9 @@ class TestServingChatWithHarmony:
...
@@ -1154,7 +1234,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_2
,
input_messages_2
,
[
[
...
@@ -1175,7 +1257,9 @@ class TestServingChatWithHarmony:
...
@@ -1175,7 +1257,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
[
[
...
@@ -1219,7 +1303,9 @@ class TestServingChatWithHarmony:
...
@@ -1219,7 +1303,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_2
,
input_messages_2
,
[
[
...
@@ -1256,7 +1342,9 @@ class TestServingChatWithHarmony:
...
@@ -1256,7 +1342,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
[
[
...
@@ -1300,7 +1388,9 @@ class TestServingChatWithHarmony:
...
@@ -1300,7 +1388,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_2
,
input_messages_2
,
[
[
...
@@ -1337,7 +1427,9 @@ class TestServingChatWithHarmony:
...
@@ -1337,7 +1427,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
[
[
...
@@ -1381,7 +1473,9 @@ class TestServingChatWithHarmony:
...
@@ -1381,7 +1473,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_2
,
input_messages_2
,
[
[
...
@@ -1431,7 +1525,9 @@ class TestServingChatWithHarmony:
...
@@ -1431,7 +1525,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the third turn's input
# Test the Harmony messages for the third turn's input
req_3
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req_3
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_3
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_3
)
input_messages_3
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_3
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_3
,
input_messages_3
,
[
[
...
@@ -1494,7 +1590,9 @@ class TestServingChatWithHarmony:
...
@@ -1494,7 +1590,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the fourth turn's input
# Test the Harmony messages for the fourth turn's input
req_4
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
req_4
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_4
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_4
)
input_messages_4
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_4
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages_4
,
input_messages_4
,
[
[
...
@@ -1543,7 +1641,9 @@ class TestServingChatWithHarmony:
...
@@ -1543,7 +1641,9 @@ class TestServingChatWithHarmony:
},
},
]
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
...
@@ -1574,7 +1674,9 @@ class TestServingChatWithHarmony:
...
@@ -1574,7 +1674,9 @@ class TestServingChatWithHarmony:
},
},
]
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
...
@@ -1603,7 +1705,9 @@ class TestServingChatWithHarmony:
...
@@ -1603,7 +1705,9 @@ class TestServingChatWithHarmony:
},
},
]
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
verify_harmony_messages
(
input_messages
,
input_messages
,
...
@@ -1634,11 +1738,14 @@ async def test_tool_choice_validation_without_parser():
...
@@ -1634,11 +1738,14 @@ async def test_tool_choice_validation_without_parser():
engine_client
=
mock_engine
,
engine_client
=
mock_engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
)
openai_serving_render
=
_build_serving_render
(
mock_engine
,
models
.
registry
)
# Create serving_chat without tool_parser (enable_auto_tools=False)
# Create serving_chat without tool_parser (enable_auto_tools=False)
serving_chat
=
OpenAIServingChat
(
serving_chat
=
OpenAIServingChat
(
mock_engine
,
mock_engine
,
models
,
models
,
response_role
=
"assistant"
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
request_logger
=
None
,
...
...
tests/entrypoints/openai/test_serving_chat_stream_harmony.py
→
tests/entrypoints/openai/
chat_completion/
test_serving_chat_stream_harmony.py
View file @
3fb4b5fa
...
@@ -180,20 +180,13 @@ class TestExtractHarmonyStreamingDelta:
...
@@ -180,20 +180,13 @@ class TestExtractHarmonyStreamingDelta:
assert
delta_message
.
tool_calls
[
0
].
index
==
1
assert
delta_message
.
tool_calls
[
0
].
index
==
1
@
pytest
.
mark
.
parametrize
(
def
test_returns_preambles_as_content
(
self
):
"channel,recipient"
,
"""Test that commentary with no recipient (preamble) is user content."""
[
(
"commentary"
,
None
),
(
"commentary"
,
"browser.search"
),
],
)
def
test_returns_tool_call_preambles
(
self
,
channel
,
recipient
):
"""Test that invalid tool recipient on commentary is treated as content."""
parser
=
MockStreamableParser
()
parser
=
MockStreamableParser
()
delta_text
=
"some text"
delta_text
=
"some text"
token_states
=
[
token_states
=
[
TokenState
(
channel
=
channel
,
recipient
=
recipient
,
text
=
delta_text
)
TokenState
(
channel
=
"commentary"
,
recipient
=
None
,
text
=
delta_text
)
]
]
delta_message
,
tools_streamed
=
extract_harmony_streaming_delta
(
delta_message
,
tools_streamed
=
extract_harmony_streaming_delta
(
...
@@ -211,6 +204,7 @@ class TestExtractHarmonyStreamingDelta:
...
@@ -211,6 +204,7 @@ class TestExtractHarmonyStreamingDelta:
[
[
(
None
,
None
),
(
None
,
None
),
(
"unknown_channel"
,
None
),
(
"unknown_channel"
,
None
),
(
"commentary"
,
"browser.search"
),
],
],
)
)
def
test_returns_none_for_invalid_inputs
(
self
,
channel
,
recipient
):
def
test_returns_none_for_invalid_inputs
(
self
,
channel
,
recipient
):
...
...
vllm/model_executor/layers/quantization/kernels
/__init__.py
→
tests/entrypoints/openai/cpu
/__init__.py
View file @
3fb4b5fa
File moved
tests/entrypoints/openai/test_render.py
→
tests/entrypoints/openai/
cpu/
test_render.py
View file @
3fb4b5fa
...
@@ -7,7 +7,7 @@ import httpx
...
@@ -7,7 +7,7 @@ import httpx
import
pytest
import
pytest
import
pytest_asyncio
import
pytest_asyncio
from
..
.utils
import
Remote
OpenAI
Server
from
tests
.utils
import
Remote
LaunchRender
Server
MODEL_NAME
=
"hmellor/tiny-random-LlamaForCausalLM"
MODEL_NAME
=
"hmellor/tiny-random-LlamaForCausalLM"
...
@@ -16,7 +16,7 @@ MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
...
@@ -16,7 +16,7 @@ MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
def
server
():
def
server
():
args
:
list
[
str
]
=
[]
args
:
list
[
str
]
=
[]
with
Remote
OpenAI
Server
(
MODEL_NAME
,
args
)
as
remote_server
:
with
Remote
LaunchRender
Server
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
yield
remote_server
...
@@ -43,23 +43,20 @@ async def test_completion_render_basic(client):
...
@@ -43,23 +43,20 @@ async def test_completion_render_basic(client):
assert
response
.
status_code
==
200
assert
response
.
status_code
==
200
data
=
response
.
json
()
data
=
response
.
json
()
# Verify response structure
# Verify response structure
- list of GenerateRequest
assert
isinstance
(
data
,
list
)
assert
isinstance
(
data
,
list
)
assert
len
(
data
)
>
0
assert
len
(
data
)
>
0
# Verify first prompt
# Verify first prompt
is a GenerateRequest
first_prompt
=
data
[
0
]
first_prompt
=
data
[
0
]
assert
"prompt_token_ids"
in
first_prompt
assert
"token_ids"
in
first_prompt
assert
"prompt"
in
first_prompt
assert
"sampling_params"
in
first_prompt
assert
isinstance
(
first_prompt
[
"prompt_token_ids"
],
list
)
assert
"model"
in
first_prompt
assert
len
(
first_prompt
[
"prompt_token_ids"
])
>
0
assert
"request_id"
in
first_prompt
assert
isinstance
(
first_prompt
[
"prompt"
],
str
)
assert
isinstance
(
first_prompt
[
"token_ids"
],
list
)
assert
len
(
first_prompt
[
"token_ids"
])
>
0
# Verify prompt text is preserved
assert
first_prompt
[
"model"
]
==
MODEL_NAME
assert
(
assert
first_prompt
[
"request_id"
].
startswith
(
"cmpl-"
)
"When should a chat-completions handler return an empty string?"
in
first_prompt
[
"prompt"
]
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -84,36 +81,15 @@ async def test_chat_completion_render_basic(client):
...
@@ -84,36 +81,15 @@ async def test_chat_completion_render_basic(client):
assert
response
.
status_code
==
200
assert
response
.
status_code
==
200
data
=
response
.
json
()
data
=
response
.
json
()
# Verify response structure - should be [conversation, engine_prompts]
# Verify response structure - should be a GenerateRequest
assert
isinstance
(
data
,
list
)
assert
isinstance
(
data
,
dict
)
assert
len
(
data
)
==
2
assert
"token_ids"
in
data
assert
isinstance
(
data
[
"token_ids"
],
list
)
conversation
,
engine_prompts
=
data
assert
len
(
data
[
"token_ids"
])
>
0
# Verify conversation
assert
isinstance
(
conversation
,
list
)
assert
len
(
conversation
)
>
0
assert
conversation
[
0
][
"role"
]
==
"user"
assert
"empty string"
in
conversation
[
0
][
"content"
]
# Verify engine_prompts
assert
isinstance
(
engine_prompts
,
list
)
assert
len
(
engine_prompts
)
>
0
first_prompt
=
engine_prompts
[
0
]
# Verify token IDs are integers and BOS token is present
assert
"prompt_token_ids"
in
first_prompt
token_ids
=
data
[
"token_ids"
]
assert
"prompt"
in
first_prompt
assert
isinstance
(
first_prompt
[
"prompt_token_ids"
],
list
)
assert
len
(
first_prompt
[
"prompt_token_ids"
])
>
0
# Verify chat template was applied (should have instruction markers)
assert
"[INST]"
in
first_prompt
[
"prompt"
]
assert
"[/INST]"
in
first_prompt
[
"prompt"
]
# Verify token IDs are correctly preserved as integers
token_ids
=
first_prompt
[
"prompt_token_ids"
]
assert
all
(
isinstance
(
tid
,
int
)
for
tid
in
token_ids
)
assert
all
(
isinstance
(
tid
,
int
)
for
tid
in
token_ids
)
# Verify BOS token (usually 1 for LLaMA models)
assert
token_ids
[
0
]
==
1
assert
token_ids
[
0
]
==
1
...
@@ -131,15 +107,18 @@ async def test_completion_render_multiple_prompts(client):
...
@@ -131,15 +107,18 @@ async def test_completion_render_multiple_prompts(client):
assert
response
.
status_code
==
200
assert
response
.
status_code
==
200
data
=
response
.
json
()
data
=
response
.
json
()
# Should return two
prompt
s
# Should return two
GenerateRequest item
s
assert
isinstance
(
data
,
list
)
assert
isinstance
(
data
,
list
)
assert
len
(
data
)
==
2
assert
len
(
data
)
==
2
# Verify both prompts have
required
fields
# Verify both prompts have
GenerateRequest
fields
for
prompt
in
data
:
for
prompt
in
data
:
assert
"prompt_token_ids"
in
prompt
assert
"token_ids"
in
prompt
assert
"prompt"
in
prompt
assert
"sampling_params"
in
prompt
assert
len
(
prompt
[
"prompt_token_ids"
])
>
0
assert
"model"
in
prompt
assert
"request_id"
in
prompt
assert
len
(
prompt
[
"token_ids"
])
>
0
assert
prompt
[
"request_id"
].
startswith
(
"cmpl-"
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -160,17 +139,49 @@ async def test_chat_completion_render_multi_turn(client):
...
@@ -160,17 +139,49 @@ async def test_chat_completion_render_multi_turn(client):
assert
response
.
status_code
==
200
assert
response
.
status_code
==
200
data
=
response
.
json
()
data
=
response
.
json
()
conversation
,
engine_prompts
=
data
# Verify tokenization occurred
assert
isinstance
(
data
,
dict
)
assert
"token_ids"
in
data
assert
isinstance
(
data
[
"token_ids"
],
list
)
assert
len
(
data
[
"token_ids"
])
>
0
# Verify all messages preserved
assert
len
(
conversation
)
==
3
assert
conversation
[
0
][
"role"
]
==
"user"
assert
conversation
[
1
][
"role"
]
==
"assistant"
assert
conversation
[
2
][
"role"
]
==
"user"
# Verify tokenization occurred
@
pytest
.
mark
.
asyncio
assert
len
(
engine_prompts
)
>
0
async
def
test_chat_completion_render_with_stream_true
(
client
):
assert
len
(
engine_prompts
[
0
][
"prompt_token_ids"
])
>
0
"""Render accepts stream params but still returns JSON (non-streamed)."""
response
=
await
client
.
post
(
"/v1/chat/completions/render"
,
json
=
{
"model"
:
MODEL_NAME
,
"stream"
:
True
,
"stream_options"
:
{
"include_usage"
:
True
,
"continuous_usage_stats"
:
True
,
},
"messages"
:
[
{
"role"
:
"user"
,
"content"
:
"Stream options should be accepted by /render."
,
}
],
},
)
assert
response
.
status_code
==
200
assert
response
.
headers
.
get
(
"content-type"
,
""
).
startswith
(
"application/json"
)
data
=
response
.
json
()
assert
isinstance
(
data
,
dict
)
assert
"token_ids"
in
data
assert
isinstance
(
data
[
"token_ids"
],
list
)
assert
len
(
data
[
"token_ids"
])
>
0
# /render should preserve stream fields on the returned token-in request.
assert
data
.
get
(
"stream"
)
is
True
assert
isinstance
(
data
.
get
(
"stream_options"
),
dict
)
assert
data
[
"stream_options"
].
get
(
"include_usage"
)
is
True
assert
data
[
"stream_options"
].
get
(
"continuous_usage_stats"
)
is
True
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -224,3 +235,31 @@ async def test_completion_render_no_generation(client):
...
@@ -224,3 +235,31 @@ async def test_completion_render_no_generation(client):
assert
response
.
status_code
==
200
assert
response
.
status_code
==
200
# Render should be fast (< 1 second) since no generation
# Render should be fast (< 1 second) since no generation
assert
elapsed
<
1.0
assert
elapsed
<
1.0
@pytest.mark.asyncio
async def test_chat_completion_render_with_sampling_params(client):
    """Check that /render echoes the caller's sampling parameters.

    Posts a chat request carrying explicit ``temperature``, ``top_p`` and
    ``frequency_penalty`` values and expects each one to come back unchanged
    under ``sampling_params``, without the private ``_all_stop_token_ids``
    key.
    """
    requested = {
        "temperature": 0.123,
        "top_p": 0.456,
        "frequency_penalty": 1.1,
    }
    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": "Test sampling params"}],
        **requested,
    }

    response = await client.post("/v1/chat/completions/render", json=payload)
    assert response.status_code == 200

    body = response.json()
    assert "sampling_params" in body
    returned = body["sampling_params"]

    # Each requested value must round-trip exactly.
    for field, value in requested.items():
        assert returned.get(field) == value

    # Internal-only fields must not appear in the returned payload.
    assert "_all_stop_token_ids" not in returned
tests/entrypoints/openai/cpu/test_render_multimodal.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Multimodal tests for the /render endpoints that expose prompt preprocessing."""
import
httpx
import
pytest
import
pytest_asyncio
from
tests.utils
import
RemoteOpenAIServer
from
vllm.multimodal.utils
import
encode_image_url
VISION_MODEL_NAME
=
"Qwen/Qwen3-VL-2B-Instruct"
@pytest.fixture(scope="module")
def vision_server():
    """Start one vision model server shared by this module's /render tests.

    The server is launched with ``--enforce-eager``, a 100-token maximum
    model length, a single sequence, and a per-prompt limit of one image
    and zero videos.
    """
    server_env: dict[str, str] = {}
    cli_args = [
        "--enforce-eager",
        "--max-model-len",
        "100",
        "--max-num-seqs",
        "1",
        "--limit-mm-per-prompt.image",
        "1",
        "--limit-mm-per-prompt.video",
        "0",
    ]

    launcher = RemoteOpenAIServer(VISION_MODEL_NAME, cli_args, env_dict=server_env)
    with launcher as running_server:
        yield running_server
@pytest_asyncio.fixture
async def vision_client(vision_server):
    """Yield an async HTTP client rooted at the vision server's base URL."""
    root_url = vision_server.url_for("")
    async with httpx.AsyncClient(base_url=root_url, timeout=60.0) as session:
        yield session
@pytest.mark.asyncio
async def test_chat_completion_render_with_base64_image_url(
    vision_client,
    local_asset_server,
):
    """Render an image-plus-text chat request and inspect the result.

    The image is sent inline as a base64 ``data:`` URL. The response must
    carry non-empty ``token_ids`` and a ``features`` object whose
    ``mm_hashes`` and ``mm_placeholders`` entries both describe the image.
    """
    asset = local_asset_server.get_image_asset("RGBA_comp.png")
    data_url = encode_image_url(asset, format="PNG")
    assert data_url.startswith("data:image/")
    assert ";base64," in data_url

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": data_url}},
            {"type": "text", "text": "What's in this image?"},
        ],
    }
    response = await vision_client.post(
        "/v1/chat/completions/render",
        json={"model": VISION_MODEL_NAME, "messages": [user_turn]},
    )
    assert response.status_code == 200

    rendered = response.json()
    assert isinstance(rendered, dict)
    assert "token_ids" in rendered
    assert isinstance(rendered["token_ids"], list)
    assert len(rendered["token_ids"]) > 0

    # The multimodal feature block must be present and populated.
    assert "features" in rendered
    mm_features = rendered["features"]
    assert mm_features is not None

    # mm_hashes: the "image" entry is a non-empty list of hash strings.
    assert "mm_hashes" in mm_features
    assert "image" in mm_features["mm_hashes"]
    hashes = mm_features["mm_hashes"]["image"]
    assert isinstance(hashes, list)
    assert len(hashes) > 0
    assert all(isinstance(digest, str) for digest in hashes)

    # mm_placeholders: the "image" entry is a non-empty list of
    # offset/length dicts.
    assert "mm_placeholders" in mm_features
    assert "image" in mm_features["mm_placeholders"]
    placeholders = mm_features["mm_placeholders"]["image"]
    assert isinstance(placeholders, list)
    assert len(placeholders) > 0
    for span in placeholders:
        assert "offset" in span
        assert "length" in span
        assert isinstance(span["offset"], int)
        assert isinstance(span["length"], int)
        assert span["length"] > 0
@pytest.mark.asyncio
async def test_tokenize_matches_render_for_multimodal_input(
    vision_client,
    local_asset_server,
):
    """Check `/tokenize` and `/v1/chat/completions/render` agree on tokens.

    The same image-plus-text conversation is posted to both endpoints; the
    token list and token count from `/tokenize` must equal the rendered
    ``token_ids`` and their length.
    """
    asset = local_asset_server.get_image_asset("RGBA_comp.png")
    data_url = encode_image_url(asset, format="PNG")

    image_part = {"type": "image_url", "image_url": {"url": data_url}}
    text_part = {"type": "text", "text": "What's in this image?"}
    messages = [{"role": "user", "content": [image_part, text_part]}]

    render_response = await vision_client.post(
        "/v1/chat/completions/render",
        json={"model": VISION_MODEL_NAME, "messages": messages},
    )
    assert render_response.status_code == 200
    rendered = render_response.json()

    tokenize_response = await vision_client.post(
        "/tokenize",
        json={"model": VISION_MODEL_NAME, "messages": messages},
    )
    assert tokenize_response.status_code == 200
    tokenized = tokenize_response.json()

    assert tokenized["tokens"] == rendered["token_ids"]
    assert tokenized["count"] == len(rendered["token_ids"])
tests/entrypoints/openai/parser/test_harmony_utils.py
View file @
3fb4b5fa
...
@@ -2,31 +2,32 @@
...
@@ -2,31 +2,32 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
from
openai.types.responses
import
ResponseFunctionToolCall
,
ResponseReasoningItem
from
openai_harmony
import
Message
,
Role
from
openai.types.responses.response_output_item
import
McpCall
from
openai_harmony
import
Author
,
Message
,
Role
,
TextContent
from
tests.entrypoints.openai.utils
import
verify_harmony_messages
from
tests.entrypoints.openai.utils
import
verify_harmony_messages
from
vllm.entrypoints.openai.parser.harmony_utils
import
(
from
vllm.entrypoints.openai.parser.harmony_utils
import
(
auto_drop_analysis_messages
,
auto_drop_analysis_messages
,
get_encoding
,
get_encoding
,
get_system_message
,
has_custom_tools
,
has_custom_tools
,
parse_chat_input_to_harmony_message
,
parse_chat_input_to_harmony_message
,
parse_chat_output
,
parse_chat_output
,
parse_input_to_harmony_message
,
)
parse_output_message
,
from
vllm.entrypoints.openai.responses.harmony
import
(
response_input_to_harmony
,
response_previous_input_to_harmony
,
)
)
class
TestCommonParseInputToHarmonyMessage
:
class
TestCommonParseInputToHarmonyMessage
:
"""
"""
Tests for scenarios that are common to both Chat Completion
Tests for scenarios that are common to both Chat Completion
parse_chat_input_to_harmony_message and Response
e
s API
parse_chat_input_to_harmony_message and Responses API
parse
_input_to_harmony
_message
functions.
response_previous
_input_to_harmony functions.
"""
"""
@
pytest
.
fixture
(
@
pytest
.
fixture
(
params
=
[
parse_chat_input_to_harmony_message
,
parse
_input_to_harmony
_message
]
params
=
[
parse_chat_input_to_harmony_message
,
response_previous
_input_to_harmony
]
)
)
def
parse_function
(
self
,
request
):
def
parse_function
(
self
,
request
):
return
request
.
param
return
request
.
param
...
@@ -211,81 +212,6 @@ class TestCommonParseInputToHarmonyMessage:
...
@@ -211,81 +212,6 @@ class TestCommonParseInputToHarmonyMessage:
assert
messages
[
0
].
content
[
1
].
text
==
"actual text"
assert
messages
[
0
].
content
[
1
].
text
==
"actual text"
class TestParseInputToHarmonyMessage:
    """
    Scenarios that apply only to the Responses API
    parse_input_to_harmony_message function.
    """

    def test_message_with_empty_content(self):
        """A user message whose content is "" yields one empty-text message."""
        parsed = parse_input_to_harmony_message({"role": "user", "content": ""})

        assert len(parsed) == 1
        assert parsed[0].content[0].text == ""

    def test_tool_message_with_string_content(self):
        """A tool message with plain string content keeps that text."""
        tool_output = "The weather in San Francisco is sunny, 72°F"
        parsed = parse_input_to_harmony_message(
            {"role": "tool", "name": "get_weather", "content": tool_output}
        )

        assert len(parsed) == 1
        only = parsed[0]
        assert only.author.role == Role.TOOL
        assert only.author.name == "functions.get_weather"
        assert only.content[0].text == tool_output
        assert only.channel == "commentary"

    def test_tool_message_with_array_content(self):
        """Text parts of an array-valued tool message are concatenated."""
        parts = [
            {"type": "text", "text": "Result 1: "},
            {"type": "text", "text": "Result 2: "},
            # Non-text part: expected to be left out of the joined text.
            {"type": "image", "url": "http://example.com/img.png"},
            {"type": "text", "text": "Result 3"},
        ]
        parsed = parse_input_to_harmony_message(
            {"role": "tool", "name": "search_results", "content": parts}
        )

        assert len(parsed) == 1
        only = parsed[0]
        assert only.author.role == Role.TOOL
        assert only.author.name == "functions.search_results"
        assert only.content[0].text == "Result 1: Result 2: Result 3"

    def test_tool_message_with_empty_content(self):
        """A tool message whose content is None yields empty text."""
        parsed = parse_input_to_harmony_message(
            {"role": "tool", "name": "empty_tool", "content": None}
        )

        assert len(parsed) == 1
        only = parsed[0]
        assert only.author.role == Role.TOOL
        assert only.author.name == "functions.empty_tool"
        assert only.content[0].text == ""
class
TestParseChatInputToHarmonyMessage
:
class
TestParseChatInputToHarmonyMessage
:
"""
"""
Tests for scenarios that are specific to the Chat Completion API
Tests for scenarios that are specific to the Chat Completion API
...
@@ -840,192 +766,47 @@ class TestParseChatOutput:
...
@@ -840,192 +766,47 @@ class TestParseChatOutput:
assert
reasoning
==
"I've thought hard about this."
assert
reasoning
==
"I've thought hard about this."
assert
final_content
==
"The answer is 4."
assert
final_content
==
"The answer is 4."
def
test_parse_chat_output_commentary_with_recipient_excluded
(
self
)
->
None
:
"""Commentary with a recipient (tool call) should not appear in
final_content — those are handled separately by the tool parser.
class
TestParseOutputMessage
:
The first message is a preamble (visible), the second is a tool
"""Tests for parse_output_message function."""
call (excluded). Only the preamble should appear in final_content.
def
test_commentary_with_no_recipient_creates_reasoning
(
self
):
"""Test that commentary with recipient=None (preambles) creates reasoning items.
Per Harmony format, commentary channel can contain preambles to calling
multiple functions - explanatory text with no recipient.
"""
"""
message
=
Message
.
from_role_and_content
(
harmony_str
=
(
Role
.
ASSISTANT
,
"I will now search for the weather information."
"<|channel|>commentary"
)
"<|message|>Let me check the weather.<|end|>"
message
=
message
.
with_channel
(
"commentary"
)
"<|start|>assistant to=functions.get_weather"
# recipient is None by default, representing a preamble
"<|channel|>commentary"
'<|message|>{"location": "SF"}<|end|>'
output_items
=
parse_output_message
(
message
)
assert
len
(
output_items
)
==
1
assert
isinstance
(
output_items
[
0
],
ResponseReasoningItem
)
assert
output_items
[
0
].
type
==
"reasoning"
assert
(
output_items
[
0
].
content
[
0
].
text
==
"I will now search for the weather information."
)
assert
output_items
[
0
].
content
[
0
].
type
==
"reasoning_text"
def test_commentary_with_function_recipient_creates_function_call(self):
    """Commentary addressed to 'functions.X' is parsed as a function call."""
    arguments = '{"location": "San Francisco", "units": "celsius"}'
    message = (
        Message.from_role_and_content(Role.ASSISTANT, arguments)
        .with_channel("commentary")
        .with_recipient("functions.get_weather")
    )

    output_items = parse_output_message(message)

    assert len(output_items) == 1
    call = output_items[0]
    assert isinstance(call, ResponseFunctionToolCall)
    assert call.type == "function_call"
    assert call.name == "get_weather"
    assert call.arguments == arguments
    assert call.call_id.startswith("call_")
    assert call.id.startswith("fc_")
def test_commentary_with_python_recipient_creates_reasoning(self):
    """Commentary addressed to 'python' is parsed as a reasoning item."""
    code = "import numpy as np\nprint(np.array([1, 2, 3]))"
    message = (
        Message.from_role_and_content(Role.ASSISTANT, code)
        .with_channel("commentary")
        .with_recipient("python")
    )

    output_items = parse_output_message(message)

    assert len(output_items) == 1
    item = output_items[0]
    assert isinstance(item, ResponseReasoningItem)
    assert item.type == "reasoning"
    assert item.content[0].text == code
def test_commentary_with_browser_recipient_creates_reasoning(self):
    """Commentary addressed to 'browser' is parsed as a reasoning item."""
    text = "Navigating to the specified URL"
    message = (
        Message.from_role_and_content(Role.ASSISTANT, text)
        .with_channel("commentary")
        .with_recipient("browser")
    )

    output_items = parse_output_message(message)

    assert len(output_items) == 1
    item = output_items[0]
    assert isinstance(item, ResponseReasoningItem)
    assert item.type == "reasoning"
    assert item.content[0].text == text
def test_commentary_with_container_recipient_creates_reasoning(self):
    """Commentary addressed to 'container' is parsed as a reasoning item."""
    text = "Running command in container"
    message = (
        Message.from_role_and_content(Role.ASSISTANT, text)
        .with_channel("commentary")
        .with_recipient("container")
    )

    output_items = parse_output_message(message)

    assert len(output_items) == 1
    item = output_items[0]
    assert isinstance(item, ResponseReasoningItem)
    assert item.type == "reasoning"
    assert item.content[0].text == text
def test_commentary_with_empty_content_and_no_recipient(self):
    """Edge case: empty commentary text with no recipient set."""
    message = Message.from_role_and_content(Role.ASSISTANT, "").with_channel(
        "commentary"
    )

    output_items = parse_output_message(message)

    assert len(output_items) == 1
    item = output_items[0]
    assert isinstance(item, ResponseReasoningItem)
    assert item.content[0].text == ""
def test_commentary_with_multiple_contents_and_no_recipient(self):
    """Each content entry of recipient-less commentary becomes its own item."""
    step_one = "Step 1: Analyze the request"
    step_two = "Step 2: Prepare to call functions"
    message = Message.from_role_and_contents(
        Role.ASSISTANT,
        [TextContent(text=step_one), TextContent(text=step_two)],
    ).with_channel("commentary")

    output_items = parse_output_message(message)

    assert len(output_items) == 2
    assert all(isinstance(entry, ResponseReasoningItem) for entry in output_items)
    assert output_items[0].content[0].text == step_one
    assert output_items[1].content[0].text == step_two
def test_commentary_with_multiple_function_calls(self):
    """Two content entries addressed to one function yield two calls."""
    first_args = '{"location": "San Francisco"}'
    second_args = '{"location": "New York"}'
    message = (
        Message.from_role_and_contents(
            Role.ASSISTANT,
            [TextContent(text=first_args), TextContent(text=second_args)],
        )
        .with_channel("commentary")
        .with_recipient("functions.get_weather")
    )

    output_items = parse_output_message(message)

    assert len(output_items) == 2
    assert all(
        isinstance(entry, ResponseFunctionToolCall) for entry in output_items
    )
    first_call, second_call = output_items[0], output_items[1]
    assert first_call.name == "get_weather"
    assert second_call.name == "get_weather"
    assert first_call.arguments == first_args
    assert second_call.arguments == second_args
def test_commentary_with_unknown_recipient_creates_mcp_call(self):
    """Commentary addressed to an unrecognised recipient becomes an MCP call."""
    message = (
        Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}')
        .with_channel("commentary")
        .with_recipient("custom_tool")
    )

    output_items = parse_output_message(message)

    assert len(output_items) == 1
    call = output_items[0]
    assert isinstance(call, McpCall)
    assert call.type == "mcp_call"
    assert call.name == "custom_tool"
    assert call.server_label == "custom_tool"
def test_analysis_channel_creates_reasoning(self):
    """A message on the analysis channel is parsed as a reasoning item."""
    text = "Analyzing the problem step by step..."
    message = Message.from_role_and_content(Role.ASSISTANT, text).with_channel(
        "analysis"
    )

    output_items = parse_output_message(message)

    assert len(output_items) == 1
    item = output_items[0]
    assert isinstance(item, ResponseReasoningItem)
    assert item.type == "reasoning"
    assert item.content[0].text == text
)
token_ids
=
get_encoding
().
encode
(
harmony_str
,
allowed_special
=
"all"
)
reasoning
,
final_content
,
_
=
parse_chat_output
(
token_ids
)
assert
reasoning
is
None
assert
final_content
==
"Let me check the weather."
def
test_non_assistant_message_returns_empty
(
self
):
def
test_parse_chat_output_interrupted_preamble
(
self
)
->
None
:
"""Test that non-assistant messages return empty list.
"""Partial/interrupted preamble (commentary without recipient) should
appear in final_content, not reasoning."""
harmony_str
=
"<|channel|>commentary<|message|>I'll search for that"
token_ids
=
get_encoding
().
encode
(
harmony_str
,
allowed_special
=
"all"
)
reasoning
,
final_content
,
_
=
parse_chat_output
(
token_ids
)
assert
reasoning
is
None
assert
final_content
==
"I'll search for that"
Per the implementation, tool messages to assistant (e.g., search results)
def
test_parse_chat_output_preamble_then_final
(
self
)
->
None
:
are not included in final output to align with OpenAI behavior.
"""Preamble followed by a final message should both appear in
"""
final_content, joined by newline."""
message
=
Message
.
from_author_and_content
(
harmony_str
=
(
Author
.
new
(
Role
.
TOOL
,
"functions.get_weather"
),
"<|channel|>commentary"
"The weather is sunny, 72°F"
,
"<|message|>Let me look that up.<|end|>"
"<|start|>assistant<|channel|>final"
"<|message|>The answer is 42.<|end|>"
)
)
token_ids
=
get_encoding
().
encode
(
harmony_str
,
allowed_special
=
"all"
)
output_items
=
parse_output
_message
(
message
)
reasoning
,
final_content
,
_
=
parse_
chat_
output
(
token_ids
)
assert
reasoning
is
None
assert
len
(
output_items
)
==
0
assert
final_content
==
"Let me look that up.
\n
The answer is 42."
def
test_has_custom_tools
()
->
None
:
def
test_has_custom_tools
()
->
None
:
...
@@ -1037,165 +818,113 @@ def test_has_custom_tools() -> None:
...
@@ -1037,165 +818,113 @@ def test_has_custom_tools() -> None:
)
)
def
test_parse_mcp_call_basic
()
->
None
:
class
TestGetSystemMessage
:
"""Test that MCP calls are parsed with correct type and server_label."""
"""Tests for get_system_message channel configuration."""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
'{"path": "/tmp"}'
)
message
=
message
.
with_recipient
(
"filesystem"
)
message
=
message
.
with_channel
(
"commentary"
)
output_items
=
parse_output_message
(
message
)
def
test_commentary_channel_present_without_custom_tools
(
self
)
->
None
:
"""Commentary channel must be valid even without custom tools."""
sys_msg
=
get_system_message
(
with_custom_tools
=
False
)
valid_channels
=
sys_msg
.
content
[
0
].
channel_config
.
valid_channels
assert
"commentary"
in
valid_channels
assert
len
(
output_items
)
==
1
def
test_commentary_channel_present_with_custom_tools
(
self
)
->
None
:
assert
isinstance
(
output_items
[
0
],
McpCall
)
"""Commentary channel present when custom tools are enabled."""
assert
output_items
[
0
].
type
==
"mcp_call"
sys_msg
=
get_system_message
(
with_custom_tools
=
True
)
assert
output_items
[
0
].
name
==
"filesystem"
valid_channels
=
sys_msg
.
content
[
0
].
channel_config
.
valid_channels
assert
output_items
[
0
].
server_label
==
"filesystem"
assert
"commentary"
in
valid_channels
assert
output_items
[
0
].
arguments
==
'{"path": "/tmp"}'
assert
output_items
[
0
].
status
==
"completed"
def
test_all_standard_channels_present
(
self
)
->
None
:
"""All three standard Harmony channels should always be valid."""
for
with_tools
in
(
True
,
False
):
sys_msg
=
get_system_message
(
with_custom_tools
=
with_tools
)
valid_channels
=
sys_msg
.
content
[
0
].
channel_config
.
valid_channels
for
channel
in
(
"analysis"
,
"commentary"
,
"final"
):
assert
channel
in
valid_channels
,
(
f
"
{
channel
}
missing when with_custom_tools=
{
with_tools
}
"
)
def
test_parse_mcp_call_dotted_recipient
()
->
None
:
"""Test that dotted recipients extract the tool name correctly."""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
'{"cmd": "ls"}'
)
message
=
message
.
with_recipient
(
"repo_browser.list"
)
message
=
message
.
with_channel
(
"commentary"
)
output_items
=
parse_output_message
(
message
)
class
TestResponseInputToHarmonyReasoningItem
:
"""Tests for response_input_to_harmony handling of reasoning input items.
assert
len
(
output_items
)
==
1
Per the OpenAI spec, ResponseReasoningItem.content is
assert
isinstance
(
output_items
[
0
],
McpCall
)
Optional[List[Content]] = None. Clients like langchain-openai may omit
assert
output_items
[
0
].
name
==
"list"
this field when constructing multi-turn input from previous responses.
assert
output_items
[
0
].
server_label
==
"repo_browser"
Reasoning items with content are converted to Harmony messages on the
'analysis' channel. All content items are concatenated. Items without
content return None (skipped by the caller).
"""
def
test_mcp_vs_function_call
()
->
None
:
def
test_reasoning_with_single_content
(
self
):
"""Test that function calls are not parsed as MCP calls."""
"""Test reasoning item with a single content entry."""
func_message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
'{"arg": "value"}'
)
item
=
{
func_message
=
func_message
.
with_recipient
(
"functions.my_tool"
)
"type"
:
"reasoning"
,
func_message
=
func_message
.
with_channel
(
"commentary"
)
"id"
:
"rs_123"
,
"content"
:
[{
"type"
:
"reasoning_text"
,
"text"
:
"Thinking step by step"
}],
}
func_ite
ms
=
parse_output_message
(
func_message
)
ms
g
=
response_input_to_harmony
(
item
,
prev_responses
=
[]
)
assert
len
(
func_items
)
==
1
assert
msg
is
not
None
assert
not
isinstance
(
func_items
[
0
],
McpCall
)
assert
msg
.
author
.
role
==
Role
.
ASSISTANT
assert
func_items
[
0
].
type
==
"function_call"
assert
msg
.
content
[
0
].
text
==
"Thinking step by step"
assert
msg
.
channel
==
"analysis"
def
test_reasoning_with_multiple_content_items
(
self
):
"""Test reasoning item with multiple content entries concatenated."""
item
=
{
"type"
:
"reasoning"
,
"id"
:
"rs_123"
,
"content"
:
[
{
"type"
:
"reasoning_text"
,
"text"
:
"First, let me analyze"
},
{
"type"
:
"reasoning_text"
,
"text"
:
"Second, I should consider"
},
{
"type"
:
"reasoning_text"
,
"text"
:
"Finally, the answer is"
},
],
}
msg
=
response_input_to_harmony
(
item
,
prev_responses
=
[])
assert
msg
is
not
None
assert
msg
.
author
.
role
==
Role
.
ASSISTANT
assert
msg
.
content
[
0
].
text
==
(
"First, let me analyze
\n
Second, I should consider
\n
Finally, the answer is"
)
assert
msg
.
channel
==
"analysis"
def
test_reasoning_without_content_returns_none
(
self
):
"""Test reasoning item without content field returns None."""
item
=
{
"type"
:
"reasoning"
,
"id"
:
"rs_123"
,
"summary"
:
[{
"type"
:
"summary_text"
,
"text"
:
"Thinking about math"
}],
}
msg
=
response_input_to_harmony
(
item
,
prev_responses
=
[])
def
test_mcp_vs_builtin_tools
()
->
None
:
assert
msg
is
None
"""Test that built-in tools (python, container) are not parsed as MCP calls."""
# Test python (built-in tool) - should be reasoning, not MCP
python_message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
"print('hello')"
)
python_message
=
python_message
.
with_recipient
(
"python"
)
python_message
=
python_message
.
with_channel
(
"commentary"
)
python_items
=
parse_output_message
(
python_message
)
def
test_reasoning_with_none_content_returns_none
(
self
):
"""Test reasoning item with content=None returns None."""
item
=
{
"type"
:
"reasoning"
,
"id"
:
"rs_123"
,
"content"
:
None
,
"summary"
:
[{
"type"
:
"summary_text"
,
"text"
:
"Thinking about math"
}],
}
assert
len
(
python_items
)
==
1
msg
=
response_input_to_harmony
(
item
,
prev_responses
=
[])
assert
not
isinstance
(
python_items
[
0
],
McpCall
)
assert
python_items
[
0
].
type
==
"reasoning"
assert
msg
is
None
def
test_reasoning_with_empty_content_returns_none
(
self
):
"""Test reasoning item with empty content list returns None."""
item
=
{
"type"
:
"reasoning"
,
"id"
:
"rs_123"
,
"content"
:
[],
}
msg
=
response_input_to_harmony
(
item
,
prev_responses
=
[])
def
test_parse_remaining_state_commentary_channel
()
->
None
:
assert
msg
is
None
"""Test parse_remaining_state with commentary channel and various recipients."""
from
unittest.mock
import
Mock
from
vllm.entrypoints.openai.parser.harmony_utils
import
parse_remaining_state
# Test 1: functions.* recipient → should return function tool call
parser_func
=
Mock
()
parser_func
.
current_content
=
'{"arg": "value"}'
parser_func
.
current_role
=
Role
.
ASSISTANT
parser_func
.
current_channel
=
"commentary"
parser_func
.
current_recipient
=
"functions.my_tool"
func_items
=
parse_remaining_state
(
parser_func
)
assert
len
(
func_items
)
==
1
assert
not
isinstance
(
func_items
[
0
],
McpCall
)
assert
func_items
[
0
].
type
==
"function_call"
assert
func_items
[
0
].
name
==
"my_tool"
assert
func_items
[
0
].
status
==
"in_progress"
# Test 2: MCP tool (not builtin) → should return MCP call
parser_mcp
=
Mock
()
parser_mcp
.
current_content
=
'{"path": "/tmp"}'
parser_mcp
.
current_role
=
Role
.
ASSISTANT
parser_mcp
.
current_channel
=
"commentary"
parser_mcp
.
current_recipient
=
"filesystem"
mcp_items
=
parse_remaining_state
(
parser_mcp
)
assert
len
(
mcp_items
)
==
1
assert
isinstance
(
mcp_items
[
0
],
McpCall
)
assert
mcp_items
[
0
].
type
==
"mcp_call"
assert
mcp_items
[
0
].
name
==
"filesystem"
assert
mcp_items
[
0
].
server_label
==
"filesystem"
assert
mcp_items
[
0
].
status
==
"in_progress"
# Test 3: Built-in tool (python)
# should NOT return MCP call, falls through to reasoning
parser_builtin
=
Mock
()
parser_builtin
.
current_content
=
"print('hello')"
parser_builtin
.
current_role
=
Role
.
ASSISTANT
parser_builtin
.
current_channel
=
"commentary"
parser_builtin
.
current_recipient
=
"python"
builtin_items
=
parse_remaining_state
(
parser_builtin
)
# Should fall through to reasoning logic
assert
len
(
builtin_items
)
==
1
assert
not
isinstance
(
builtin_items
[
0
],
McpCall
)
assert
builtin_items
[
0
].
type
==
"reasoning"
def test_parse_remaining_state_analysis_channel() -> None:
    """Exercise parse_remaining_state on the analysis channel.

    Three recipients are tried in turn: a ``functions.*`` tool, a tool
    named "database", and the "container" tool.
    """
    from unittest.mock import Mock

    from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state

    def remaining_items(content: str, recipient: str):
        # Stand-in parser exposing only the attributes set below.
        parser = Mock()
        parser.current_content = content
        parser.current_role = Role.ASSISTANT
        parser.current_channel = "analysis"
        parser.current_recipient = recipient
        return parse_remaining_state(parser)

    # Case 1: functions.* recipient -> function tool call.
    func_items = remaining_items('{"arg": "value"}', "functions.my_tool")
    assert len(func_items) == 1
    func_call = func_items[0]
    assert not isinstance(func_call, McpCall)
    assert func_call.type == "function_call"
    assert func_call.name == "my_tool"
    assert func_call.status == "in_progress"

    # Case 2: MCP tool (not built-in) -> MCP call.
    mcp_items = remaining_items('{"query": "test"}', "database")
    assert len(mcp_items) == 1
    mcp_call = mcp_items[0]
    assert isinstance(mcp_call, McpCall)
    assert mcp_call.type == "mcp_call"
    assert mcp_call.name == "database"
    assert mcp_call.server_label == "database"
    assert mcp_call.status == "in_progress"

    # Case 3: built-in tool (container) -> not an MCP call; expected to
    # come back as reasoning instead.
    builtin_items = remaining_items("docker run", "container")
    assert len(builtin_items) == 1
    builtin_item = builtin_items[0]
    assert not isinstance(builtin_item, McpCall)
    assert builtin_item.type == "reasoning"
tests/entrypoints/openai/responses/conftest.py
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
import
json
import
logging
from
collections.abc
import
Callable
from
typing
import
Any
import
pytest
import
pytest
logger
=
logging
.
getLogger
(
__name__
)
BASE_TEST_ENV
=
{
# The day vLLM said "hello world" on arxiv 🚀
"VLLM_SYSTEM_START_DATE"
:
"2023-09-12"
,
}
DEFAULT_MAX_RETRIES
=
3
@
pytest
.
fixture
@
pytest
.
fixture
def
pairs_of_event_types
()
->
dict
[
str
,
str
]:
def
pairs_of_event_types
()
->
dict
[
str
,
str
]:
...
@@ -24,7 +39,325 @@ def pairs_of_event_types() -> dict[str, str]:
...
@@ -24,7 +39,325 @@ def pairs_of_event_types() -> dict[str, str]:
"response.mcp_call.completed"
:
"response.mcp_call.in_progress"
,
"response.mcp_call.completed"
:
"response.mcp_call.in_progress"
,
"response.function_call_arguments.done"
:
"response.function_call_arguments.delta"
,
# noqa: E501
"response.function_call_arguments.done"
:
"response.function_call_arguments.delta"
,
# noqa: E501
"response.code_interpreter_call_code.done"
:
"response.code_interpreter_call_code.delta"
,
# noqa: E501
"response.code_interpreter_call_code.done"
:
"response.code_interpreter_call_code.delta"
,
# noqa: E501
"response.code_interpreter_call.completed"
:
"response.code_interpreter_call.in_progress"
,
# noqa: E501
"response.web_search_call.completed"
:
"response.web_search_call.in_progress"
,
"response.web_search_call.completed"
:
"response.web_search_call.in_progress"
,
}
}
# fmt: on
# fmt: on
return
event_pairs
return
event_pairs
async def retry_for_tool_call(
    client,
    *,
    model: str,
    expected_tool_type: str,
    max_retries: int = DEFAULT_MAX_RETRIES,
    **create_kwargs: Any,
):
    """Call ``client.responses.create`` until the expected tool call shows up.

    The request is issued up to *max_retries* times. The first response that
    contains an output item whose ``type`` equals *expected_tool_type* is
    returned immediately. If no attempt matches, the **last** response is
    returned so that the caller's own assertions fail with a clear
    diagnostic.

    Args:
        client: Object exposing an awaitable ``responses.create``.
        model: Model name forwarded to ``responses.create``.
        expected_tool_type: Output item ``type`` that ends the retry loop.
        max_retries: Maximum number of requests to issue; must be >= 1.
        **create_kwargs: Extra keyword arguments forwarded unchanged to
            ``responses.create``.

    Returns:
        The first matching response, or the last response received.

    Raises:
        ValueError: If *max_retries* is smaller than 1, because no request
            would be made and there would be nothing to return.
    """
    # Validate explicitly instead of relying on a message-less assert after
    # the loop: the failure then names the offending argument.
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    response = None
    for _ in range(max_retries):
        response = await client.responses.create(model=model, **create_kwargs)
        if any(
            getattr(item, "type", None) == expected_tool_type
            for item in response.output
        ):
            break
    return response
async def retry_streaming_for(
    client,
    *,
    model: str,
    validate_events: Callable[[list], bool],
    max_retries: int = DEFAULT_MAX_RETRIES,
    **create_kwargs: Any,
) -> list:
    """Stream ``client.responses.create`` until an event list validates.

    Each attempt calls ``client.responses.create(stream=True)``, drains the
    stream into a list and hands that list to *validate_events*. The first
    list for which the callback returns a truthy value is returned. When no
    attempt validates, the events of the final attempt are returned; when no
    attempt is made at all the result is an empty list.
    """
    collected: list = []
    for _ in range(max_retries):
        stream = await client.responses.create(
            model=model, stream=True, **create_kwargs
        )
        # Drain the whole stream before validating so the callback always
        # sees a complete attempt.
        collected = [event async for event in stream]
        if validate_events(collected):
            break
    return collected
def has_output_type(response, type_name: str) -> bool:
    """Tell whether *response* holds an output item of type *type_name*.

    Items of ``response.output`` that have no ``type`` attribute are treated
    as having type ``None`` and therefore never match a string.
    """
    for output_item in response.output:
        if getattr(output_item, "type", None) == type_name:
            return True
    return False
def events_contain_type(events: list, type_substring: str) -> bool:
    """Return True if any event's type contains *type_substring*.

    Events that lack a ``type`` attribute, or whose ``type`` is ``None``,
    are treated as having an empty type string, so they only match an empty
    *type_substring*.
    """
    # ``or ""`` covers a ``type`` attribute that is present but None; the
    # plain getattr default only covers the attribute being absent, and
    # ``substring in None`` raises TypeError.
    return any(
        type_substring in (getattr(event, "type", None) or "") for event in events
    )
def _validate_event_pairing(
    events: list, pairs_of_event_types: dict[str, str]
) -> None:
    """Assert that start/end streaming events nest like balanced brackets.

    *pairs_of_event_types* maps every end event type to the start event type
    it closes; the sets of "push" and "pop" event types are derived from it,
    so any pair present in the mapping is checked automatically. Event types
    that appear in neither set are ignored.
    """
    closing_types = set(pairs_of_event_types.keys())
    opening_types = set(pairs_of_event_types.values())
    open_stack: list[str] = []

    for event in events:
        etype = event.type

        if etype in closing_types:
            expected_start = pairs_of_event_types[etype]
            assert open_stack and open_stack[-1] == expected_start, (
                f"Stack mismatch for {etype}: "
                f"expected {expected_start}, "
                f"got {open_stack[-1] if open_stack else '<empty>'}"
            )
            open_stack.pop()
            continue

        if etype not in opening_types:
            # Passthrough event (e.g. response.in_progress,
            # web_search_call.searching, code_interpreter_call.interpreting).
            continue

        # A run of identical consecutive deltas occupies one stack slot.
        repeated_delta = (
            etype.endswith("delta") and open_stack and open_stack[-1] == etype
        )
        if not repeated_delta:
            open_stack.append(etype)

    assert len(open_stack) == 0, f"Unclosed events on stack: {open_stack}"
def _validate_event_ordering(events: list) -> None:
    """Assert that the response envelope events sit where they belong.

    Checks, in order: there are at least two events; the first one is
    ``response.created``; the last one is ``response.completed``; a
    ``response.in_progress`` event, if any, occurs only at index 1; and
    ``response.created`` / ``response.completed`` each occur exactly once.
    """
    total = len(events)
    assert total >= 2, f"Expected at least 2 events, got {total}"

    opening_event = events[0]
    assert opening_event.type == "response.created", (
        f"First event must be response.created, got {opening_event.type}"
    )

    closing_event = events[-1]
    assert closing_event.type == "response.completed", (
        f"Last event must be response.completed, got {closing_event.type}"
    )

    # response.in_progress is optional, but when present it must be the
    # second event and must not repeat.
    in_progress_indices = [
        position
        for position, candidate in enumerate(events)
        if candidate.type == "response.in_progress"
    ]
    assert not in_progress_indices or in_progress_indices == [1], (
        f"response.in_progress must be the second event, "
        f"found at indices {in_progress_indices}"
    )

    created_count = len([ev for ev in events if ev.type == "response.created"])
    completed_count = len([ev for ev in events if ev.type == "response.completed"])
    assert created_count == 1, (
        f"Expected exactly 1 response.created, got {created_count}"
    )
    assert completed_count == 1, (
        f"Expected exactly 1 response.completed, got {completed_count}"
    )
def _validate_field_consistency(events: list) -> None:
    """Assert that item_id, output_index and content_index stay consistent.

    ``response.output_item.added`` opens an output item and records its id
    and output index; every following event is compared against those values
    until ``response.output_item.done`` closes the item. A
    ``content_part.added`` / ``reasoning_part.added`` event records the
    content index that later events of the same item are compared against.
    Identifiers that are absent (``None``) on either side are not compared.
    """
    envelope_types = {
        "response.created",
        "response.in_progress",
        "response.completed",
    }
    part_opening_types = (
        "response.content_part.added",
        "response.reasoning_part.added",
    )

    current_item_id: str | None = None
    current_output_index: int | None = None
    current_content_index: int | None = None
    previous_output_index: int = -1

    for event in events:
        etype = event.type

        if etype in envelope_types:
            # Response-level events are not tied to an output item.
            continue

        if etype == "response.output_item.added":
            # Opens a new item.
            item = getattr(event, "item", None)
            output_index = getattr(event, "output_index", None)
            assert item is not None, "output_item.added must have an item"
            item_id = getattr(item, "id", None)
            assert item_id, "output_item.added item must have an id"

            # output_index must be non-decreasing across items.
            if output_index is not None:
                assert output_index >= previous_output_index, (
                    f"output_index went backwards: "
                    f"{output_index} < {previous_output_index}"
                )
                previous_output_index = output_index

            current_item_id, current_output_index = item_id, output_index
            current_content_index = None

        elif etype == "response.output_item.done":
            # Closes the active item.
            item = getattr(event, "item", None)
            output_index = getattr(event, "output_index", None)
            assert item is not None, "output_item.done must have an item"
            done_item_id = getattr(item, "id", None)

            if current_item_id is not None and done_item_id:
                assert done_item_id == current_item_id, (
                    f"output_item.done item.id mismatch: "
                    f"expected {current_item_id}, got {done_item_id}"
                )
            if current_output_index is not None and output_index is not None:
                assert output_index == current_output_index, (
                    f"output_item.done output_index mismatch: "
                    f"expected {current_output_index}, got {output_index}"
                )

            current_item_id = None
            current_output_index = None
            current_content_index = None

        elif etype in part_opening_types:
            # A new part fixes the content_index for the events after it.
            _assert_item_fields(event, etype, current_item_id, current_output_index)
            current_content_index = getattr(event, "content_index", None)

        else:
            # Any other item-level event.
            _assert_item_fields(event, etype, current_item_id, current_output_index)

            # content_index is only compared on events that carry it.
            content_index = getattr(event, "content_index", None)
            if content_index is not None and current_content_index is not None:
                assert content_index == current_content_index, (
                    f"{etype} content_index mismatch: "
                    f"expected {current_content_index}, got {content_index}"
                )
def _assert_item_fields(
    event,
    etype: str,
    active_item_id: str | None,
    active_output_index: int | None,
) -> None:
    """Assert that *event* refers to the currently active output item.

    ``event.item_id`` and ``event.output_index`` are compared against
    *active_item_id* and *active_output_index*. A comparison is skipped when
    either side is ``None`` (attribute missing on the event, or no active
    value recorded). *etype* is only used to label the failure message.
    """
    event_item_id = getattr(event, "item_id", None)
    event_output_index = getattr(event, "output_index", None)

    ids_comparable = active_item_id is not None and event_item_id is not None
    if ids_comparable:
        assert event_item_id == active_item_id, (
            f"{etype} item_id mismatch: expected {active_item_id}, got {event_item_id}"
        )

    indices_comparable = (
        active_output_index is not None and event_output_index is not None
    )
    if indices_comparable:
        assert event_output_index == active_output_index, (
            f"{etype} output_index mismatch: "
            f"expected {active_output_index}, got {event_output_index}"
        )
def validate_streaming_event_stack(
    events: list, pairs_of_event_types: dict[str, str]
) -> None:
    """Run every streaming-event validation on *events*.

    Three checks are applied, in this order, each raising ``AssertionError``
    on the first violation it finds:

    1. **Event pairing** — start/end events are properly nested
       (stack-based matching derived from *pairs_of_event_types*).
    2. **Event ordering** — envelope events (``created``,
       ``in_progress``, ``completed``) appear at the correct positions.
    3. **Field consistency** — ``item_id``, ``output_index``, and
       ``content_index`` are consistent across related events within
       each output item's lifecycle.
    """
    _validate_event_pairing(
        events=events,
        pairs_of_event_types=pairs_of_event_types,
    )
    _validate_event_ordering(events=events)
    _validate_field_consistency(events=events)
def log_response_diagnostics(
    response,
    *,
    label: str = "Response Diagnostics",
) -> dict[str, Any]:
    """Extract and log diagnostic info from a Responses API response.

    Logs reasoning, tool-call attempts, MCP items, and output types so
    that CI output (``pytest -s`` or ``--log-cli-level=INFO``) gives
    full visibility into model behaviour even on passing runs.

    Args:
        response: Object exposing ``output`` (iterable of output items) and
            ``output_text``; ``output_messages`` is read when available and
            its entries are accessed with ``.get``.
        label: Heading used in the logged banner.

    Returns:
        The extracted data (keys ``model_attempted_tool_calls``,
        ``tool_call_attempts``, ``mcp_items``, ``reasoning``,
        ``output_text``, ``output_types``) so callers can make additional
        assertions if needed.
    """
    reasoning_texts = [
        text
        for item in response.output
        if getattr(item, "type", None) == "reasoning"
        # A reasoning item may carry ``content=None``; treat it as empty
        # instead of failing while iterating over None.
        for content in getattr(item, "content", None) or []
        if (text := getattr(content, "text", None))
    ]

    # ``output_messages`` may be absent or None on a response; a diagnostics
    # helper should then report "no attempts" rather than raise.
    tool_call_attempts = [
        {
            "recipient": msg.get("recipient"),
            "channel": msg.get("channel"),
        }
        for msg in getattr(response, "output_messages", None) or []
        if (msg.get("recipient") or "").startswith("python")
    ]

    mcp_items = [
        {
            "name": getattr(item, "name", None),
            "status": getattr(item, "status", None),
        }
        for item in response.output
        if getattr(item, "type", None) == "mcp_call"
    ]

    output_types = [getattr(o, "type", None) for o in response.output]

    diagnostics = {
        "model_attempted_tool_calls": bool(tool_call_attempts),
        "tool_call_attempts": tool_call_attempts,
        "mcp_items": mcp_items,
        "reasoning": reasoning_texts,
        "output_text": response.output_text,
        "output_types": output_types,
    }

    # default=str keeps the dump from failing on non-JSON-serialisable values.
    logger.info(
        "\n====== %s ======\n%s\n==============================",
        label,
        json.dumps(diagnostics, indent=2, default=str),
    )
    return diagnostics
tests/entrypoints/openai/responses/test_errors.py
View file @
3fb4b5fa
...
@@ -6,7 +6,6 @@ from unittest.mock import MagicMock
...
@@ -6,7 +6,6 @@ from unittest.mock import MagicMock
import
pytest
import
pytest
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.engine.serving
import
GenerationError
,
OpenAIServing
from
vllm.entrypoints.openai.engine.serving
import
GenerationError
,
OpenAIServing
...
@@ -38,32 +37,6 @@ async def test_raise_if_error_raises_generation_error():
...
@@ -38,32 +37,6 @@ async def test_raise_if_error_raises_generation_error():
serving
.
_raise_if_error
(
None
,
"test-request-id"
)
# should not raise
serving
.
_raise_if_error
(
None
,
"test-request-id"
)
# should not raise
@
pytest
.
mark
.
asyncio
async
def
test_convert_generation_error_to_response
():
"""test _convert_generation_error_to_response creates proper ErrorResponse"""
mock_engine
=
MagicMock
()
mock_engine
.
model_config
=
MagicMock
()
mock_engine
.
model_config
.
max_model_len
=
100
mock_models
=
MagicMock
()
serving
=
OpenAIServing
(
engine_client
=
mock_engine
,
models
=
mock_models
,
request_logger
=
None
,
)
# create a GenerationError
gen_error
=
GenerationError
(
"Internal server error"
)
# convert to ErrorResponse
error_response
=
serving
.
_convert_generation_error_to_response
(
gen_error
)
assert
isinstance
(
error_response
,
ErrorResponse
)
assert
error_response
.
error
.
type
==
"InternalServerError"
assert
error_response
.
error
.
message
==
"Internal server error"
assert
error_response
.
error
.
code
==
HTTPStatus
.
INTERNAL_SERVER_ERROR
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_convert_generation_error_to_streaming_response
():
async
def
test_convert_generation_error_to_streaming_response
():
"""test _convert_generation_error_to_streaming_response output"""
"""test _convert_generation_error_to_streaming_response output"""
...
...
Prev
1
…
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment