Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3fb4b5fa
Commit
3fb4b5fa
authored
Mar 23, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.0' into v0.18.0-ori
parents
bcf25339
89138b21
Changes
488
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
878 additions
and
548 deletions
+878
-548
tests/entrypoints/openai/chat_completion/test_serving_chat.py
...s/entrypoints/openai/chat_completion/test_serving_chat.py
+152
-45
tests/entrypoints/openai/chat_completion/test_serving_chat_stream_harmony.py
...penai/chat_completion/test_serving_chat_stream_harmony.py
+4
-10
tests/entrypoints/openai/cpu/__init__.py
tests/entrypoints/openai/cpu/__init__.py
+0
-0
tests/entrypoints/openai/cpu/test_render.py
tests/entrypoints/openai/cpu/test_render.py
+96
-57
tests/entrypoints/openai/cpu/test_render_multimodal.py
tests/entrypoints/openai/cpu/test_render_multimodal.py
+155
-0
tests/entrypoints/openai/parser/test_harmony_utils.py
tests/entrypoints/openai/parser/test_harmony_utils.py
+138
-409
tests/entrypoints/openai/responses/conftest.py
tests/entrypoints/openai/responses/conftest.py
+333
-0
tests/entrypoints/openai/responses/test_errors.py
tests/entrypoints/openai/responses/test_errors.py
+0
-27
No files found.
Too many changes to show.
To preserve performance only
488 of 488+
files are displayed.
Plain diff
Email patch
tests/entrypoints/openai/test_serving_chat.py
→
tests/entrypoints/openai/
chat_completion/
test_serving_chat.py
View file @
3fb4b5fa
...
...
@@ -10,6 +10,12 @@ import pytest
import
pytest_asyncio
from
openai
import
OpenAI
from
tests.entrypoints.openai.utils
import
(
accumulate_streaming_response
,
verify_chat_response
,
verify_harmony_messages
,
)
from
tests.utils
import
RemoteOpenAIServer
from
vllm._aiter_ops
import
is_aiter_found_and_supported
from
vllm.config
import
MultiModalConfig
from
vllm.entrypoints.openai.chat_completion.protocol
import
(
...
...
@@ -21,8 +27,14 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse
,
RequestResponseMetadata
,
)
from
vllm.entrypoints.openai.models.serving
import
BaseModelPath
,
OpenAIServingModels
from
vllm.entrypoints.openai.models.serving
import
(
BaseModelPath
,
OpenAIModelRegistry
,
OpenAIServingModels
,
)
from
vllm.entrypoints.openai.parser.harmony_utils
import
get_encoding
from
vllm.entrypoints.serve.render.serving
import
OpenAIServingRender
from
vllm.exceptions
import
VLLMValidationError
from
vllm.inputs
import
TokensPrompt
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.renderers.hf
import
HfRenderer
...
...
@@ -33,13 +45,6 @@ from vllm.tokenizers.registry import tokenizer_args_from_config
from
vllm.tool_parsers
import
ToolParserManager
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
...utils
import
RemoteOpenAIServer
from
.utils
import
(
accumulate_streaming_response
,
verify_chat_response
,
verify_harmony_messages
,
)
GPT_OSS_MODEL_NAME
=
"openai/gpt-oss-20b"
GPT_OSS_SPECULATOR_NAME
=
"RedHatAI/gpt-oss-20b-speculator.eagle3"
...
...
@@ -126,7 +131,7 @@ def gptoss_speculative_server(default_server_args: list[str]):
if
is_aiter_found_and_supported
():
env_dict
=
{
"VLLM_ROCM_USE_AITER"
:
"1"
}
with
RemoteOpenAIServer
(
GPT_OSS_MODEL_NAME
,
server_args
,
env_dict
=
env_dict
GPT_OSS_MODEL_NAME
,
server_args
,
env_dict
=
env_dict
,
max_wait_seconds
=
480
)
as
remote_server
:
yield
remote_server
...
...
@@ -520,38 +525,67 @@ class MockModelConfig:
multimodal_config
=
MultiModalConfig
()
hf_config
=
MockHFConfig
()
logits_processors
:
list
[
str
]
|
None
=
None
logits_processor_pattern
=
None
diff_sampling_param
:
dict
|
None
=
None
allowed_local_media_path
:
str
=
""
allowed_media_domains
:
list
[
str
]
|
None
=
None
encoder_config
=
None
generation_config
:
str
=
"auto"
override_generation_config
:
dict
[
str
,
Any
]
=
field
(
default_factory
=
dict
)
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
field
(
default_factory
=
dict
)
skip_tokenizer_init
:
bool
=
False
is_encoder_decoder
:
bool
=
False
is_multimodal_model
:
bool
=
False
def
get_diff_sampling_param
(
self
):
return
self
.
diff_sampling_param
or
{}
@dataclass
class MockParallelConfig:
    """Stand-in parallel config used when assembling ``MockVllmConfig``."""

    # The only attribute this mock defines; it defaults to 0.
    _api_process_rank: int = 0
@dataclass
class MockVllmConfig:
    """Pairs a mock model config with a mock parallel config.

    Both fields are required; neither has a default.
    """

    # Mock model configuration.
    model_config: MockModelConfig
    # Mock parallel configuration.
    parallel_config: MockParallelConfig
def
_build_renderer
(
model_config
:
MockModelConfig
):
_
,
tokenizer_name
,
_
,
kwargs
=
tokenizer_args_from_config
(
model_config
)
return
HfRenderer
(
model_config
,
return
HfRenderer
.
from_config
(
MockVllmConfig
(
model_config
,
parallel_config
=
MockParallelConfig
())
,
tokenizer_kwargs
=
{
**
kwargs
,
"tokenizer_name"
:
tokenizer_name
},
)
def _build_serving_render(
    engine, model_registry: OpenAIModelRegistry
) -> OpenAIServingRender:
    """Create an ``OpenAIServingRender`` from an engine and a model registry.

    Args:
        engine: Object exposing ``model_config``, ``renderer`` and
            ``io_processor`` attributes, which are forwarded unchanged.
        model_registry: Registry handed to the render service.

    Returns:
        The constructed ``OpenAIServingRender``, configured with no request
        logger, the module's ``CHAT_TEMPLATE`` and the ``"auto"`` content
        format.
    """
    # Collect the keyword arguments first so the constructor call stays short.
    render_kwargs = dict(
        model_config=engine.model_config,
        renderer=engine.renderer,
        io_processor=engine.io_processor,
        model_registry=model_registry,
        request_logger=None,
        chat_template=CHAT_TEMPLATE,
        chat_template_content_format="auto",
    )
    return OpenAIServingRender(**render_kwargs)
def
_build_serving_chat
(
engine
:
AsyncLLM
)
->
OpenAIServingChat
:
models
=
OpenAIServingModels
(
engine_client
=
engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
openai_serving_render
=
_build_serving_render
(
engine
,
models
.
registry
)
serving_chat
=
OpenAIServingChat
(
engine
,
models
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
...
...
@@ -572,10 +606,13 @@ async def _async_serving_chat_init():
engine
=
MockEngine
()
models
=
OpenAIServingModels
(
engine
,
BASE_MODEL_PATHS
)
openai_serving_render
=
_build_serving_render
(
engine
,
models
.
registry
)
serving_completion
=
OpenAIServingChat
(
engine
,
models
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
...
...
@@ -645,12 +682,10 @@ async def test_serving_chat_should_set_correct_max_tokens():
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
10
#
Setting server's max_tokens in the generation_config.json
#
lower than context_window - prompt_tokens
#
Model author's generation_config.json sets max_tokens (auto, no override)
#
— should act as fallback only, not ceiling
mock_model_config
=
MockModelConfig
()
mock_model_config
.
diff_sampling_param
=
{
"max_tokens"
:
10
# Setting server-side max_tokens limit
}
mock_model_config
.
diff_sampling_param
=
{
"max_tokens"
:
10
}
# Reinitialize the engine with new settings
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
...
...
@@ -674,13 +709,14 @@ async def test_serving_chat_should_set_correct_max_tokens():
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
10
# Test Case 2: Request's max_tokens set higher than server accepts
# Test Case 2: Request's max_tokens set higher than generation_config
# default so request-provided max_tokens takes precedence
req
.
max_tokens
=
15
with
suppress
(
Exception
):
await
serving_chat
.
create_chat_completion
(
req
)
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
1
0
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
1
5
# Test Case 3: Request's max_tokens set lower than server accepts
req
.
max_tokens
=
5
...
...
@@ -690,12 +726,52 @@ async def test_serving_chat_should_set_correct_max_tokens():
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
5
# User explicitly sets max_tokens via --override-generation-config
# — should act as a ceiling
mock_model_config
=
MockModelConfig
()
mock_model_config
.
diff_sampling_param
=
{
"max_tokens"
:
10
}
mock_model_config
.
override_generation_config
=
{
"max_new_tokens"
:
10
}
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
mock_engine
.
errored
=
False
mock_engine
.
model_config
=
mock_model_config
mock_engine
.
input_processor
=
MagicMock
()
mock_engine
.
io_processor
=
MagicMock
()
mock_engine
.
renderer
=
_build_renderer
(
mock_engine
.
model_config
)
serving_chat
=
_build_serving_chat
(
mock_engine
)
# Test Case 3.1: No max_tokens — uses override as default
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"user"
,
"content"
:
"what is 1+1?"
}],
)
with
suppress
(
Exception
):
await
serving_chat
.
create_chat_completion
(
req
)
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
10
# Test Case 3.2: Request max_tokens higher — capped by user ceiling from override
req
.
max_tokens
=
15
with
suppress
(
Exception
):
await
serving_chat
.
create_chat_completion
(
req
)
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
10
# Test Case 3.3: Request max_tokens lower — respected
req
.
max_tokens
=
5
with
suppress
(
Exception
):
await
serving_chat
.
create_chat_completion
(
req
)
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
max_tokens
==
5
# Setting server's max_tokens in the generation_config.json
# higher than context_window - prompt_tokens
mock_model_config
=
MockModelConfig
()
mock_model_config
.
diff_sampling_param
=
{
"max_tokens"
:
200
# Setting server-side max_tokens limit
}
mock_model_config
.
diff_sampling_param
=
{
"max_tokens"
:
200
}
# Reinitialize the engine with new settings
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
...
...
@@ -749,8 +825,10 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
mock_engine
.
io_processor
=
MagicMock
()
mock_tokenizer
=
MagicMock
(
spec
=
MistralTokenizer
)
mock_renderer
=
MistralRenderer
(
mock_engine
.
model_config
,
tokenizer_kwargs
=
{})
mock_renderer
.
_tokenizer
=
mock_tokenizer
mock_renderer
=
MistralRenderer
(
MockVllmConfig
(
mock_engine
.
model_config
,
parallel_config
=
MockParallelConfig
()),
tokenizer
=
mock_tokenizer
,
)
# Force the Mistral chat template renderer to return token IDs.
# Choose a prompt length that is < max_model_len, but large enough that
# adding max_tokens should exceed the model context window.
...
...
@@ -770,9 +848,8 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
max_tokens
=
10
,
)
resp
=
await
serving_chat
.
create_chat_completion
(
req
)
assert
isinstance
(
resp
,
ErrorResponse
)
assert
"context length is only"
in
resp
.
error
.
message
with
pytest
.
raises
(
VLLMValidationError
):
await
serving_chat
.
create_chat_completion
(
req
)
@
pytest
.
mark
.
asyncio
...
...
@@ -788,8 +865,10 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
mock_engine
.
io_processor
=
MagicMock
()
mock_tokenizer
=
MagicMock
(
spec
=
MistralTokenizer
)
mock_renderer
=
MistralRenderer
(
mock_engine
.
model_config
,
tokenizer_kwargs
=
{})
mock_renderer
.
_tokenizer
=
mock_tokenizer
mock_renderer
=
MistralRenderer
(
MockVllmConfig
(
mock_engine
.
model_config
,
parallel_config
=
MockParallelConfig
()),
tokenizer
=
mock_tokenizer
,
)
# prompt_token_ids length == max_model_len should be rejected for
# completion-like requests (ChatCompletionRequest).
mock_renderer
.
render_messages_async
=
AsyncMock
(
...
...
@@ -810,9 +889,8 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
max_tokens
=
1
,
)
resp
=
await
serving_chat
.
create_chat_completion
(
req
)
assert
isinstance
(
resp
,
ErrorResponse
)
assert
"context length is only"
in
resp
.
error
.
message
with
pytest
.
raises
(
VLLMValidationError
):
await
serving_chat
.
create_chat_completion
(
req
)
@
pytest
.
mark
.
asyncio
...
...
@@ -1127,7 +1205,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -1154,7 +1234,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -1175,7 +1257,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -1219,7 +1303,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -1256,7 +1342,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -1300,7 +1388,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -1337,7 +1427,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the first turn's input
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
[
...
...
@@ -1381,7 +1473,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the second turn's input
req_2
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_2
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_2
)
input_messages_2
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_2
)
)
verify_harmony_messages
(
input_messages_2
,
[
...
...
@@ -1431,7 +1525,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the third turn's input
req_3
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_3
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_3
)
input_messages_3
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_3
)
)
verify_harmony_messages
(
input_messages_3
,
[
...
...
@@ -1494,7 +1590,9 @@ class TestServingChatWithHarmony:
# Test the Harmony messages for the fourth turn's input
req_4
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
,
tools
=
tools
)
input_messages_4
,
_
=
serving_chat
.
_make_request_with_harmony
(
req_4
)
input_messages_4
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req_4
)
)
verify_harmony_messages
(
input_messages_4
,
[
...
...
@@ -1543,7 +1641,9 @@ class TestServingChatWithHarmony:
},
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
...
...
@@ -1574,7 +1674,9 @@ class TestServingChatWithHarmony:
},
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
...
...
@@ -1603,7 +1705,9 @@ class TestServingChatWithHarmony:
},
]
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
messages
)
input_messages
,
_
=
serving_chat
.
_make_request_with_harmony
(
req
)
input_messages
,
_
=
(
serving_chat
.
openai_serving_render
.
_make_request_with_harmony
(
req
)
)
verify_harmony_messages
(
input_messages
,
...
...
@@ -1634,11 +1738,14 @@ async def test_tool_choice_validation_without_parser():
engine_client
=
mock_engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
openai_serving_render
=
_build_serving_render
(
mock_engine
,
models
.
registry
)
# Create serving_chat without tool_parser (enable_auto_tools=False)
serving_chat
=
OpenAIServingChat
(
mock_engine
,
models
,
response_role
=
"assistant"
,
openai_serving_render
=
openai_serving_render
,
chat_template
=
CHAT_TEMPLATE
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
...
...
tests/entrypoints/openai/test_serving_chat_stream_harmony.py
→
tests/entrypoints/openai/
chat_completion/
test_serving_chat_stream_harmony.py
View file @
3fb4b5fa
...
...
@@ -180,20 +180,13 @@ class TestExtractHarmonyStreamingDelta:
assert
delta_message
.
tool_calls
[
0
].
index
==
1
@
pytest
.
mark
.
parametrize
(
"channel,recipient"
,
[
(
"commentary"
,
None
),
(
"commentary"
,
"browser.search"
),
],
)
def
test_returns_tool_call_preambles
(
self
,
channel
,
recipient
):
"""Test that invalid tool recipient on commentary is treated as content."""
def
test_returns_preambles_as_content
(
self
):
"""Test that commentary with no recipient (preamble) is user content."""
parser
=
MockStreamableParser
()
delta_text
=
"some text"
token_states
=
[
TokenState
(
channel
=
channel
,
recipient
=
recipient
,
text
=
delta_text
)
TokenState
(
channel
=
"commentary"
,
recipient
=
None
,
text
=
delta_text
)
]
delta_message
,
tools_streamed
=
extract_harmony_streaming_delta
(
...
...
@@ -211,6 +204,7 @@ class TestExtractHarmonyStreamingDelta:
[
(
None
,
None
),
(
"unknown_channel"
,
None
),
(
"commentary"
,
"browser.search"
),
],
)
def
test_returns_none_for_invalid_inputs
(
self
,
channel
,
recipient
):
...
...
vllm/model_executor/layers/quantization/kernels
/__init__.py
→
tests/entrypoints/openai/cpu
/__init__.py
View file @
3fb4b5fa
File moved
tests/entrypoints/openai/test_render.py
→
tests/entrypoints/openai/
cpu/
test_render.py
View file @
3fb4b5fa
...
...
@@ -7,7 +7,7 @@ import httpx
import
pytest
import
pytest_asyncio
from
..
.utils
import
Remote
OpenAI
Server
from
tests
.utils
import
Remote
LaunchRender
Server
MODEL_NAME
=
"hmellor/tiny-random-LlamaForCausalLM"
...
...
@@ -16,7 +16,7 @@ MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
def
server
():
args
:
list
[
str
]
=
[]
with
Remote
OpenAI
Server
(
MODEL_NAME
,
args
)
as
remote_server
:
with
Remote
LaunchRender
Server
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
...
...
@@ -43,23 +43,20 @@ async def test_completion_render_basic(client):
assert
response
.
status_code
==
200
data
=
response
.
json
()
# Verify response structure
# Verify response structure
- list of GenerateRequest
assert
isinstance
(
data
,
list
)
assert
len
(
data
)
>
0
# Verify first prompt
# Verify first prompt
is a GenerateRequest
first_prompt
=
data
[
0
]
assert
"prompt_token_ids"
in
first_prompt
assert
"prompt"
in
first_prompt
assert
isinstance
(
first_prompt
[
"prompt_token_ids"
],
list
)
assert
len
(
first_prompt
[
"prompt_token_ids"
])
>
0
assert
isinstance
(
first_prompt
[
"prompt"
],
str
)
# Verify prompt text is preserved
assert
(
"When should a chat-completions handler return an empty string?"
in
first_prompt
[
"prompt"
]
)
assert
"token_ids"
in
first_prompt
assert
"sampling_params"
in
first_prompt
assert
"model"
in
first_prompt
assert
"request_id"
in
first_prompt
assert
isinstance
(
first_prompt
[
"token_ids"
],
list
)
assert
len
(
first_prompt
[
"token_ids"
])
>
0
assert
first_prompt
[
"model"
]
==
MODEL_NAME
assert
first_prompt
[
"request_id"
].
startswith
(
"cmpl-"
)
@
pytest
.
mark
.
asyncio
...
...
@@ -84,36 +81,15 @@ async def test_chat_completion_render_basic(client):
assert
response
.
status_code
==
200
data
=
response
.
json
()
# Verify response structure - should be [conversation, engine_prompts]
assert
isinstance
(
data
,
list
)
assert
len
(
data
)
==
2
conversation
,
engine_prompts
=
data
# Verify conversation
assert
isinstance
(
conversation
,
list
)
assert
len
(
conversation
)
>
0
assert
conversation
[
0
][
"role"
]
==
"user"
assert
"empty string"
in
conversation
[
0
][
"content"
]
# Verify engine_prompts
assert
isinstance
(
engine_prompts
,
list
)
assert
len
(
engine_prompts
)
>
0
# Verify response structure - should be a GenerateRequest
assert
isinstance
(
data
,
dict
)
assert
"token_ids"
in
data
assert
isinstance
(
data
[
"token_ids"
],
list
)
assert
len
(
data
[
"token_ids"
])
>
0
first_prompt
=
engine_prompts
[
0
]
assert
"prompt_token_ids"
in
first_prompt
assert
"prompt"
in
first_prompt
assert
isinstance
(
first_prompt
[
"prompt_token_ids"
],
list
)
assert
len
(
first_prompt
[
"prompt_token_ids"
])
>
0
# Verify chat template was applied (should have instruction markers)
assert
"[INST]"
in
first_prompt
[
"prompt"
]
assert
"[/INST]"
in
first_prompt
[
"prompt"
]
# Verify token IDs are correctly preserved as integers
token_ids
=
first_prompt
[
"prompt_token_ids"
]
# Verify token IDs are integers and BOS token is present
token_ids
=
data
[
"token_ids"
]
assert
all
(
isinstance
(
tid
,
int
)
for
tid
in
token_ids
)
# Verify BOS token (usually 1 for LLaMA models)
assert
token_ids
[
0
]
==
1
...
...
@@ -131,15 +107,18 @@ async def test_completion_render_multiple_prompts(client):
assert
response
.
status_code
==
200
data
=
response
.
json
()
# Should return two
prompt
s
# Should return two
GenerateRequest item
s
assert
isinstance
(
data
,
list
)
assert
len
(
data
)
==
2
# Verify both prompts have
required
fields
# Verify both prompts have
GenerateRequest
fields
for
prompt
in
data
:
assert
"prompt_token_ids"
in
prompt
assert
"prompt"
in
prompt
assert
len
(
prompt
[
"prompt_token_ids"
])
>
0
assert
"token_ids"
in
prompt
assert
"sampling_params"
in
prompt
assert
"model"
in
prompt
assert
"request_id"
in
prompt
assert
len
(
prompt
[
"token_ids"
])
>
0
assert
prompt
[
"request_id"
].
startswith
(
"cmpl-"
)
@
pytest
.
mark
.
asyncio
...
...
@@ -160,17 +139,49 @@ async def test_chat_completion_render_multi_turn(client):
assert
response
.
status_code
==
200
data
=
response
.
json
()
conversation
,
engine_prompts
=
data
# Verify tokenization occurred
assert
isinstance
(
data
,
dict
)
assert
"token_ids"
in
data
assert
isinstance
(
data
[
"token_ids"
],
list
)
assert
len
(
data
[
"token_ids"
])
>
0
# Verify all messages preserved
assert
len
(
conversation
)
==
3
assert
conversation
[
0
][
"role"
]
==
"user"
assert
conversation
[
1
][
"role"
]
==
"assistant"
assert
conversation
[
2
][
"role"
]
==
"user"
# Verify tokenization occurred
assert
len
(
engine_prompts
)
>
0
assert
len
(
engine_prompts
[
0
][
"prompt_token_ids"
])
>
0
@pytest.mark.asyncio
async def test_chat_completion_render_with_stream_true(client):
    """Render accepts stream params but still returns JSON (non-streamed)."""
    payload = {
        "model": MODEL_NAME,
        "stream": True,
        "stream_options": {
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        "messages": [
            {
                "role": "user",
                "content": "Stream options should be accepted by /render.",
            }
        ],
    }
    response = await client.post("/v1/chat/completions/render", json=payload)

    assert response.status_code == 200
    # Even with stream=True the body must be a plain JSON document.
    content_type = response.headers.get("content-type", "")
    assert content_type.startswith("application/json")

    body = response.json()
    assert isinstance(body, dict)
    assert "token_ids" in body
    rendered_ids = body["token_ids"]
    assert isinstance(rendered_ids, list)
    assert len(rendered_ids) > 0

    # /render should preserve stream fields on the returned token-in request.
    assert body.get("stream") is True
    assert isinstance(body.get("stream_options"), dict)
    echoed_options = body["stream_options"]
    assert echoed_options.get("include_usage") is True
    assert echoed_options.get("continuous_usage_stats") is True
@
pytest
.
mark
.
asyncio
...
...
@@ -224,3 +235,31 @@ async def test_completion_render_no_generation(client):
assert
response
.
status_code
==
200
# Render should be fast (< 1 second) since no generation
assert
elapsed
<
1.0
@pytest.mark.asyncio
async def test_chat_completion_render_with_sampling_params(client):
    """Verify sampling params are correctly returned by /render."""
    # Values sent in the request and expected back under "sampling_params".
    requested = {
        "temperature": 0.123,
        "top_p": 0.456,
        "frequency_penalty": 1.1,
    }
    payload = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": "Test sampling params"}],
        **requested,
    }

    response = await client.post("/v1/chat/completions/render", json=payload)
    assert response.status_code == 200

    body = response.json()
    assert "sampling_params" in body
    returned = body["sampling_params"]

    for param_name, sent_value in requested.items():
        assert returned.get(param_name) == sent_value

    # Check that internal fields are not present
    assert "_all_stop_token_ids" not in returned
tests/entrypoints/openai/cpu/test_render_multimodal.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Multimodal tests for the /render endpoints that expose prompt preprocessing."""
import
httpx
import
pytest
import
pytest_asyncio
from
tests.utils
import
RemoteOpenAIServer
from
vllm.multimodal.utils
import
encode_image_url
VISION_MODEL_NAME
=
"Qwen/Qwen3-VL-2B-Instruct"
@pytest.fixture(scope="module")
def vision_server():
    """Vision-capable server used for multimodal /render tests.

    Launches ``RemoteOpenAIServer`` for ``VISION_MODEL_NAME`` once per module
    and yields it from inside its context manager.
    """
    cli_flags = [
        "--enforce-eager",
        "--max-model-len",
        "100",
        "--max-num-seqs",
        "1",
        "--limit-mm-per-prompt.image",
        "1",
        "--limit-mm-per-prompt.video",
        "0",
    ]
    # No environment overrides are set; an empty mapping is passed through.
    extra_env: dict[str, str] = {}

    server_cm = RemoteOpenAIServer(
        VISION_MODEL_NAME,
        cli_flags,
        env_dict=extra_env,
    )
    with server_cm as running_server:
        yield running_server
@pytest_asyncio.fixture
async def vision_client(vision_server):
    """Yield an ``httpx.AsyncClient`` whose base URL is ``vision_server.url_for("")``.

    The client uses a 60-second timeout and is closed when the fixture exits.
    """
    root_url = vision_server.url_for("")
    async with httpx.AsyncClient(base_url=root_url, timeout=60.0) as session:
        yield session
@pytest.mark.asyncio
async def test_chat_completion_render_with_base64_image_url(
    vision_client,
    local_asset_server,
):
    """Render a multimodal chat request and verify tokens are returned."""
    asset = local_asset_server.get_image_asset("RGBA_comp.png")
    data_url = encode_image_url(asset, format="PNG")

    # The encoded image must be an inline base64 data URL.
    assert data_url.startswith("data:image/")
    assert ";base64," in data_url

    user_message = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": data_url}},
            {"type": "text", "text": "What's in this image?"},
        ],
    }
    payload = {"model": VISION_MODEL_NAME, "messages": [user_message]}
    response = await vision_client.post(
        "/v1/chat/completions/render",
        json=payload,
    )

    assert response.status_code == 200

    body = response.json()
    assert isinstance(body, dict)
    assert "token_ids" in body
    rendered_ids = body["token_ids"]
    assert isinstance(rendered_ids, list)
    assert len(rendered_ids) > 0

    # Verify multimodal features are populated
    assert "features" in body
    features = body["features"]
    assert features is not None

    # mm_hashes: should have an "image" key with a list of hash strings
    assert "mm_hashes" in features
    hashes_by_modality = features["mm_hashes"]
    assert "image" in hashes_by_modality
    image_hashes = hashes_by_modality["image"]
    assert isinstance(image_hashes, list)
    assert len(image_hashes) > 0
    assert all(isinstance(digest, str) for digest in image_hashes)

    # mm_placeholders: should have an "image" key with offset/length dicts
    assert "mm_placeholders" in features
    placeholders_by_modality = features["mm_placeholders"]
    assert "image" in placeholders_by_modality
    image_placeholders = placeholders_by_modality["image"]
    assert isinstance(image_placeholders, list)
    assert len(image_placeholders) > 0
    for placeholder in image_placeholders:
        assert "offset" in placeholder
        assert "length" in placeholder
        assert isinstance(placeholder["offset"], int)
        assert isinstance(placeholder["length"], int)
        assert placeholder["length"] > 0
@pytest.mark.asyncio
async def test_tokenize_matches_render_for_multimodal_input(
    vision_client,
    local_asset_server,
):
    """`/tokenize` should match `/v1/chat/completions/render` token output."""
    asset = local_asset_server.get_image_asset("RGBA_comp.png")
    data_url = encode_image_url(asset, format="PNG")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": data_url}},
                {"type": "text", "text": "What's in this image?"},
            ],
        }
    ]

    # Render first, then tokenize, each with its own equal request body.
    render_response = await vision_client.post(
        "/v1/chat/completions/render",
        json={"model": VISION_MODEL_NAME, "messages": messages},
    )
    assert render_response.status_code == 200
    rendered = render_response.json()

    tokenize_response = await vision_client.post(
        "/tokenize",
        json={"model": VISION_MODEL_NAME, "messages": messages},
    )
    assert tokenize_response.status_code == 200
    tokenized = tokenize_response.json()

    assert tokenized["tokens"] == rendered["token_ids"]
    assert tokenized["count"] == len(rendered["token_ids"])
tests/entrypoints/openai/parser/test_harmony_utils.py
View file @
3fb4b5fa
...
...
@@ -2,31 +2,32 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
openai.types.responses
import
ResponseFunctionToolCall
,
ResponseReasoningItem
from
openai.types.responses.response_output_item
import
McpCall
from
openai_harmony
import
Author
,
Message
,
Role
,
TextContent
from
openai_harmony
import
Message
,
Role
from
tests.entrypoints.openai.utils
import
verify_harmony_messages
from
vllm.entrypoints.openai.parser.harmony_utils
import
(
auto_drop_analysis_messages
,
get_encoding
,
get_system_message
,
has_custom_tools
,
parse_chat_input_to_harmony_message
,
parse_chat_output
,
parse_input_to_harmony_message
,
parse_output_message
,
)
from
vllm.entrypoints.openai.responses.harmony
import
(
response_input_to_harmony
,
response_previous_input_to_harmony
,
)
class
TestCommonParseInputToHarmonyMessage
:
"""
Tests for scenarios that are common to both Chat Completion
parse_chat_input_to_harmony_message and Response
e
s API
parse
_input_to_harmony
_message
functions.
parse_chat_input_to_harmony_message and Responses API
response_previous
_input_to_harmony functions.
"""
@
pytest
.
fixture
(
params
=
[
parse_chat_input_to_harmony_message
,
parse
_input_to_harmony
_message
]
params
=
[
parse_chat_input_to_harmony_message
,
response_previous
_input_to_harmony
]
)
def
parse_function
(
self
,
request
):
return
request
.
param
...
...
@@ -211,81 +212,6 @@ class TestCommonParseInputToHarmonyMessage:
assert
messages
[
0
].
content
[
1
].
text
==
"actual text"
class
TestParseInputToHarmonyMessage
:
"""
Tests for scenarios that are specific to the Responses API
parse_input_to_harmony_message function.
"""
def
test_message_with_empty_content
(
self
):
"""Test parsing message with empty string content."""
chat_msg
=
{
"role"
:
"user"
,
"content"
:
""
,
}
messages
=
parse_input_to_harmony_message
(
chat_msg
)
assert
len
(
messages
)
==
1
assert
messages
[
0
].
content
[
0
].
text
==
""
def
test_tool_message_with_string_content
(
self
):
"""Test parsing tool message with string content."""
chat_msg
=
{
"role"
:
"tool"
,
"name"
:
"get_weather"
,
"content"
:
"The weather in San Francisco is sunny, 72°F"
,
}
messages
=
parse_input_to_harmony_message
(
chat_msg
)
assert
len
(
messages
)
==
1
assert
messages
[
0
].
author
.
role
==
Role
.
TOOL
assert
messages
[
0
].
author
.
name
==
"functions.get_weather"
assert
(
messages
[
0
].
content
[
0
].
text
==
"The weather in San Francisco is sunny, 72°F"
)
assert
messages
[
0
].
channel
==
"commentary"
def
test_tool_message_with_array_content
(
self
):
"""Test parsing tool message with array content."""
chat_msg
=
{
"role"
:
"tool"
,
"name"
:
"search_results"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"Result 1: "
},
{
"type"
:
"text"
,
"text"
:
"Result 2: "
},
{
"type"
:
"image"
,
"url"
:
"http://example.com/img.png"
,
},
# Should be ignored
{
"type"
:
"text"
,
"text"
:
"Result 3"
},
],
}
messages
=
parse_input_to_harmony_message
(
chat_msg
)
assert
len
(
messages
)
==
1
assert
messages
[
0
].
author
.
role
==
Role
.
TOOL
assert
messages
[
0
].
author
.
name
==
"functions.search_results"
assert
messages
[
0
].
content
[
0
].
text
==
"Result 1: Result 2: Result 3"
def
test_tool_message_with_empty_content
(
self
):
"""Test parsing tool message with None content."""
chat_msg
=
{
"role"
:
"tool"
,
"name"
:
"empty_tool"
,
"content"
:
None
,
}
messages
=
parse_input_to_harmony_message
(
chat_msg
)
assert
len
(
messages
)
==
1
assert
messages
[
0
].
author
.
role
==
Role
.
TOOL
assert
messages
[
0
].
author
.
name
==
"functions.empty_tool"
assert
messages
[
0
].
content
[
0
].
text
==
""
class
TestParseChatInputToHarmonyMessage
:
"""
Tests for scenarios that are specific to the Chat Completion API
...
...
@@ -840,192 +766,47 @@ class TestParseChatOutput:
assert
reasoning
==
"I've thought hard about this."
assert
final_content
==
"The answer is 4."
def
test_parse_chat_output_commentary_with_recipient_excluded
(
self
)
->
None
:
"""Commentary with a recipient (tool call) should not appear in
final_content — those are handled separately by the tool parser.
class
TestParseOutputMessage
:
"""Tests for parse_output_message function."""
def
test_commentary_with_no_recipient_creates_reasoning
(
self
):
"""Test that commentary with recipient=None (preambles) creates reasoning items.
Per Harmony format, commentary channel can contain preambles to calling
multiple functions - explanatory text with no recipient.
The first message is a preamble (visible), the second is a tool
call (excluded). Only the preamble should appear in final_content.
"""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
"I will now search for the weather information."
)
message
=
message
.
with_channel
(
"commentary"
)
# recipient is None by default, representing a preamble
output_items
=
parse_output_message
(
message
)
assert
len
(
output_items
)
==
1
assert
isinstance
(
output_items
[
0
],
ResponseReasoningItem
)
assert
output_items
[
0
].
type
==
"reasoning"
assert
(
output_items
[
0
].
content
[
0
].
text
==
"I will now search for the weather information."
)
assert
output_items
[
0
].
content
[
0
].
type
==
"reasoning_text"
def
test_commentary_with_function_recipient_creates_function_call
(
self
):
"""Test commentary with recipient='functions.X' creates function calls."""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
'{"location": "San Francisco", "units": "celsius"}'
)
message
=
message
.
with_channel
(
"commentary"
)
message
=
message
.
with_recipient
(
"functions.get_weather"
)
output_items
=
parse_output_message
(
message
)
assert
len
(
output_items
)
==
1
assert
isinstance
(
output_items
[
0
],
ResponseFunctionToolCall
)
assert
output_items
[
0
].
type
==
"function_call"
assert
output_items
[
0
].
name
==
"get_weather"
assert
(
output_items
[
0
].
arguments
==
'{"location": "San Francisco", "units": "celsius"}'
)
assert
output_items
[
0
].
call_id
.
startswith
(
"call_"
)
assert
output_items
[
0
].
id
.
startswith
(
"fc_"
)
def
test_commentary_with_python_recipient_creates_reasoning
(
self
):
"""Test that commentary with recipient='python' creates reasoning items."""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
"import numpy as np
\n
print(np.array([1, 2, 3]))"
)
message
=
message
.
with_channel
(
"commentary"
)
message
=
message
.
with_recipient
(
"python"
)
output_items
=
parse_output_message
(
message
)
assert
len
(
output_items
)
==
1
assert
isinstance
(
output_items
[
0
],
ResponseReasoningItem
)
assert
output_items
[
0
].
type
==
"reasoning"
assert
(
output_items
[
0
].
content
[
0
].
text
==
"import numpy as np
\n
print(np.array([1, 2, 3]))"
)
def
test_commentary_with_browser_recipient_creates_reasoning
(
self
):
"""Test that commentary with recipient='browser' creates reasoning items."""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
"Navigating to the specified URL"
)
message
=
message
.
with_channel
(
"commentary"
)
message
=
message
.
with_recipient
(
"browser"
)
output_items
=
parse_output_message
(
message
)
assert
len
(
output_items
)
==
1
assert
isinstance
(
output_items
[
0
],
ResponseReasoningItem
)
assert
output_items
[
0
].
type
==
"reasoning"
assert
output_items
[
0
].
content
[
0
].
text
==
"Navigating to the specified URL"
def
test_commentary_with_container_recipient_creates_reasoning
(
self
):
"""Test that commentary with recipient='container' creates reasoning items."""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
"Running command in container"
)
message
=
message
.
with_channel
(
"commentary"
)
message
=
message
.
with_recipient
(
"container"
)
output_items
=
parse_output_message
(
message
)
assert
len
(
output_items
)
==
1
assert
isinstance
(
output_items
[
0
],
ResponseReasoningItem
)
assert
output_items
[
0
].
type
==
"reasoning"
assert
output_items
[
0
].
content
[
0
].
text
==
"Running command in container"
def
test_commentary_with_empty_content_and_no_recipient
(
self
):
"""Test edge case: empty commentary with recipient=None."""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
""
)
message
=
message
.
with_channel
(
"commentary"
)
output_items
=
parse_output_message
(
message
)
assert
len
(
output_items
)
==
1
assert
isinstance
(
output_items
[
0
],
ResponseReasoningItem
)
assert
output_items
[
0
].
content
[
0
].
text
==
""
def
test_commentary_with_multiple_contents_and_no_recipient
(
self
):
"""Test multiple content items in commentary with no recipient."""
contents
=
[
TextContent
(
text
=
"Step 1: Analyze the request"
),
TextContent
(
text
=
"Step 2: Prepare to call functions"
),
]
message
=
Message
.
from_role_and_contents
(
Role
.
ASSISTANT
,
contents
)
message
=
message
.
with_channel
(
"commentary"
)
output_items
=
parse_output_message
(
message
)
assert
len
(
output_items
)
==
2
assert
all
(
isinstance
(
item
,
ResponseReasoningItem
)
for
item
in
output_items
)
assert
output_items
[
0
].
content
[
0
].
text
==
"Step 1: Analyze the request"
assert
output_items
[
1
].
content
[
0
].
text
==
"Step 2: Prepare to call functions"
def
test_commentary_with_multiple_function_calls
(
self
):
"""Test multiple function calls in commentary channel."""
contents
=
[
TextContent
(
text
=
'{"location": "San Francisco"}'
),
TextContent
(
text
=
'{"location": "New York"}'
),
]
message
=
Message
.
from_role_and_contents
(
Role
.
ASSISTANT
,
contents
)
message
=
message
.
with_channel
(
"commentary"
)
message
=
message
.
with_recipient
(
"functions.get_weather"
)
output_items
=
parse_output_message
(
message
)
assert
len
(
output_items
)
==
2
assert
all
(
isinstance
(
item
,
ResponseFunctionToolCall
)
for
item
in
output_items
)
assert
output_items
[
0
].
name
==
"get_weather"
assert
output_items
[
1
].
name
==
"get_weather"
assert
output_items
[
0
].
arguments
==
'{"location": "San Francisco"}'
assert
output_items
[
1
].
arguments
==
'{"location": "New York"}'
def
test_commentary_with_unknown_recipient_creates_mcp_call
(
self
):
"""Test that commentary with unknown recipient creates MCP call."""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
'{"arg": "value"}'
)
message
=
message
.
with_channel
(
"commentary"
)
message
=
message
.
with_recipient
(
"custom_tool"
)
output_items
=
parse_output_message
(
message
)
assert
len
(
output_items
)
==
1
assert
isinstance
(
output_items
[
0
],
McpCall
)
assert
output_items
[
0
].
type
==
"mcp_call"
assert
output_items
[
0
].
name
==
"custom_tool"
assert
output_items
[
0
].
server_label
==
"custom_tool"
def
test_analysis_channel_creates_reasoning
(
self
):
"""Test that analysis channel creates reasoning items."""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
"Analyzing the problem step by step..."
)
message
=
message
.
with_channel
(
"analysis"
)
output_items
=
parse_output_message
(
message
)
assert
len
(
output_items
)
==
1
assert
isinstance
(
output_items
[
0
],
ResponseReasoningItem
)
assert
output_items
[
0
].
type
==
"reasoning"
assert
(
output_items
[
0
].
content
[
0
].
text
==
"Analyzing the problem step by step..."
harmony_str
=
(
"<|channel|>commentary"
"<|message|>Let me check the weather.<|end|>"
"<|start|>assistant to=functions.get_weather"
"<|channel|>commentary"
'<|message|>{"location": "SF"}<|end|>'
)
token_ids
=
get_encoding
().
encode
(
harmony_str
,
allowed_special
=
"all"
)
reasoning
,
final_content
,
_
=
parse_chat_output
(
token_ids
)
assert
reasoning
is
None
assert
final_content
==
"Let me check the weather."
def
test_non_assistant_message_returns_empty
(
self
):
"""Test that non-assistant messages return empty list.
def
test_parse_chat_output_interrupted_preamble
(
self
)
->
None
:
"""Partial/interrupted preamble (commentary without recipient) should
appear in final_content, not reasoning."""
harmony_str
=
"<|channel|>commentary<|message|>I'll search for that"
token_ids
=
get_encoding
().
encode
(
harmony_str
,
allowed_special
=
"all"
)
reasoning
,
final_content
,
_
=
parse_chat_output
(
token_ids
)
assert
reasoning
is
None
assert
final_content
==
"I'll search for that"
Per the implementation, tool messages to assistant (e.g., search results)
are not included in final output to align with OpenAI behavior.
"""
message
=
Message
.
from_author_and_content
(
Author
.
new
(
Role
.
TOOL
,
"functions.get_weather"
),
"The weather is sunny, 72°F"
,
def
test_parse_chat_output_preamble_then_final
(
self
)
->
None
:
"""Preamble followed by a final message should both appear in
final_content, joined by newline."""
harmony_str
=
(
"<|channel|>commentary"
"<|message|>Let me look that up.<|end|>"
"<|start|>assistant<|channel|>final"
"<|message|>The answer is 42.<|end|>"
)
output_items
=
parse_output
_message
(
message
)
assert
len
(
output_items
)
==
0
token_ids
=
get_encoding
().
encode
(
harmony_str
,
allowed_special
=
"all"
)
reasoning
,
final_content
,
_
=
parse_
chat_
output
(
token_ids
)
assert
reasoning
is
None
assert
final_content
==
"Let me look that up.
\n
The answer is 42."
def
test_has_custom_tools
()
->
None
:
...
...
@@ -1037,165 +818,113 @@ def test_has_custom_tools() -> None:
)
def
test_parse_mcp_call_basic
()
->
None
:
"""Test that MCP calls are parsed with correct type and server_label."""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
'{"path": "/tmp"}'
)
message
=
message
.
with_recipient
(
"filesystem"
)
message
=
message
.
with_channel
(
"commentary"
)
class
TestGetSystemMessage
:
"""Tests for get_system_message channel configuration."""
output_items
=
parse_output_message
(
message
)
def
test_commentary_channel_present_without_custom_tools
(
self
)
->
None
:
"""Commentary channel must be valid even without custom tools."""
sys_msg
=
get_system_message
(
with_custom_tools
=
False
)
valid_channels
=
sys_msg
.
content
[
0
].
channel_config
.
valid_channels
assert
"commentary"
in
valid_channels
assert
len
(
output_items
)
==
1
assert
isinstance
(
output_items
[
0
],
McpCall
)
assert
output_items
[
0
].
type
==
"mcp_call"
assert
output_items
[
0
].
name
==
"filesystem"
assert
output_items
[
0
].
server_label
==
"filesystem"
assert
output_items
[
0
].
arguments
==
'{"path": "/tmp"}'
assert
output_items
[
0
].
status
==
"completed"
def
test_commentary_channel_present_with_custom_tools
(
self
)
->
None
:
"""Commentary channel present when custom tools are enabled."""
sys_msg
=
get_system_message
(
with_custom_tools
=
True
)
valid_channels
=
sys_msg
.
content
[
0
].
channel_config
.
valid_channels
assert
"commentary"
in
valid_channels
def
test_all_standard_channels_present
(
self
)
->
None
:
"""All three standard Harmony channels should always be valid."""
for
with_tools
in
(
True
,
False
):
sys_msg
=
get_system_message
(
with_custom_tools
=
with_tools
)
valid_channels
=
sys_msg
.
content
[
0
].
channel_config
.
valid_channels
for
channel
in
(
"analysis"
,
"commentary"
,
"final"
):
assert
channel
in
valid_channels
,
(
f
"
{
channel
}
missing when with_custom_tools=
{
with_tools
}
"
)
def
test_parse_mcp_call_dotted_recipient
()
->
None
:
"""Test that dotted recipients extract the tool name correctly."""
message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
'{"cmd": "ls"}'
)
message
=
message
.
with_recipient
(
"repo_browser.list"
)
message
=
message
.
with_channel
(
"commentary"
)
output_items
=
parse_output_message
(
message
)
class
TestResponseInputToHarmonyReasoningItem
:
"""Tests for response_input_to_harmony handling of reasoning input items.
assert
len
(
output_items
)
==
1
assert
isinstance
(
output_items
[
0
],
McpCall
)
assert
output_items
[
0
].
name
==
"list"
assert
output_items
[
0
].
server_label
==
"repo_browser"
Per the OpenAI spec, ResponseReasoningItem.content is
Optional[List[Content]] = None. Clients like langchain-openai may omit
this field when constructing multi-turn input from previous responses.
Reasoning items with content are converted to Harmony messages on the
'analysis' channel. All content items are concatenated. Items without
content return None (skipped by the caller).
"""
def
test_mcp_vs_function_call
()
->
None
:
"""Test that function calls are not parsed as MCP calls."""
func_message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
'{"arg": "value"}'
)
func_message
=
func_message
.
with_recipient
(
"functions.my_tool"
)
func_message
=
func_message
.
with_channel
(
"commentary"
)
def
test_reasoning_with_single_content
(
self
):
"""Test reasoning item with a single content entry."""
item
=
{
"type"
:
"reasoning"
,
"id"
:
"rs_123"
,
"content"
:
[{
"type"
:
"reasoning_text"
,
"text"
:
"Thinking step by step"
}],
}
func_ite
ms
=
parse_output_message
(
func_message
)
ms
g
=
response_input_to_harmony
(
item
,
prev_responses
=
[]
)
assert
len
(
func_items
)
==
1
assert
not
isinstance
(
func_items
[
0
],
McpCall
)
assert
func_items
[
0
].
type
==
"function_call"
assert
msg
is
not
None
assert
msg
.
author
.
role
==
Role
.
ASSISTANT
assert
msg
.
content
[
0
].
text
==
"Thinking step by step"
assert
msg
.
channel
==
"analysis"
def
test_reasoning_with_multiple_content_items
(
self
):
"""Test reasoning item with multiple content entries concatenated."""
item
=
{
"type"
:
"reasoning"
,
"id"
:
"rs_123"
,
"content"
:
[
{
"type"
:
"reasoning_text"
,
"text"
:
"First, let me analyze"
},
{
"type"
:
"reasoning_text"
,
"text"
:
"Second, I should consider"
},
{
"type"
:
"reasoning_text"
,
"text"
:
"Finally, the answer is"
},
],
}
msg
=
response_input_to_harmony
(
item
,
prev_responses
=
[])
assert
msg
is
not
None
assert
msg
.
author
.
role
==
Role
.
ASSISTANT
assert
msg
.
content
[
0
].
text
==
(
"First, let me analyze
\n
Second, I should consider
\n
Finally, the answer is"
)
assert
msg
.
channel
==
"analysis"
def
test_reasoning_without_content_returns_none
(
self
):
"""Test reasoning item without content field returns None."""
item
=
{
"type"
:
"reasoning"
,
"id"
:
"rs_123"
,
"summary"
:
[{
"type"
:
"summary_text"
,
"text"
:
"Thinking about math"
}],
}
msg
=
response_input_to_harmony
(
item
,
prev_responses
=
[])
def
test_mcp_vs_builtin_tools
()
->
None
:
"""Test that built-in tools (python, container) are not parsed as MCP calls."""
# Test python (built-in tool) - should be reasoning, not MCP
python_message
=
Message
.
from_role_and_content
(
Role
.
ASSISTANT
,
"print('hello')"
)
python_message
=
python_message
.
with_recipient
(
"python"
)
python_message
=
python_message
.
with_channel
(
"commentary"
)
assert
msg
is
None
python_items
=
parse_output_message
(
python_message
)
def
test_reasoning_with_none_content_returns_none
(
self
):
"""Test reasoning item with content=None returns None."""
item
=
{
"type"
:
"reasoning"
,
"id"
:
"rs_123"
,
"content"
:
None
,
"summary"
:
[{
"type"
:
"summary_text"
,
"text"
:
"Thinking about math"
}],
}
assert
len
(
python_items
)
==
1
assert
not
isinstance
(
python_items
[
0
],
McpCall
)
assert
python_items
[
0
].
type
==
"reasoning"
msg
=
response_input_to_harmony
(
item
,
prev_responses
=
[])
assert
msg
is
None
def
test_reasoning_with_empty_content_returns_none
(
self
):
"""Test reasoning item with empty content list returns None."""
item
=
{
"type"
:
"reasoning"
,
"id"
:
"rs_123"
,
"content"
:
[],
}
msg
=
response_input_to_harmony
(
item
,
prev_responses
=
[])
def
test_parse_remaining_state_commentary_channel
()
->
None
:
"""Test parse_remaining_state with commentary channel and various recipients."""
from
unittest.mock
import
Mock
from
vllm.entrypoints.openai.parser.harmony_utils
import
parse_remaining_state
# Test 1: functions.* recipient → should return function tool call
parser_func
=
Mock
()
parser_func
.
current_content
=
'{"arg": "value"}'
parser_func
.
current_role
=
Role
.
ASSISTANT
parser_func
.
current_channel
=
"commentary"
parser_func
.
current_recipient
=
"functions.my_tool"
func_items
=
parse_remaining_state
(
parser_func
)
assert
len
(
func_items
)
==
1
assert
not
isinstance
(
func_items
[
0
],
McpCall
)
assert
func_items
[
0
].
type
==
"function_call"
assert
func_items
[
0
].
name
==
"my_tool"
assert
func_items
[
0
].
status
==
"in_progress"
# Test 2: MCP tool (not builtin) → should return MCP call
parser_mcp
=
Mock
()
parser_mcp
.
current_content
=
'{"path": "/tmp"}'
parser_mcp
.
current_role
=
Role
.
ASSISTANT
parser_mcp
.
current_channel
=
"commentary"
parser_mcp
.
current_recipient
=
"filesystem"
mcp_items
=
parse_remaining_state
(
parser_mcp
)
assert
len
(
mcp_items
)
==
1
assert
isinstance
(
mcp_items
[
0
],
McpCall
)
assert
mcp_items
[
0
].
type
==
"mcp_call"
assert
mcp_items
[
0
].
name
==
"filesystem"
assert
mcp_items
[
0
].
server_label
==
"filesystem"
assert
mcp_items
[
0
].
status
==
"in_progress"
# Test 3: Built-in tool (python)
# should NOT return MCP call, falls through to reasoning
parser_builtin
=
Mock
()
parser_builtin
.
current_content
=
"print('hello')"
parser_builtin
.
current_role
=
Role
.
ASSISTANT
parser_builtin
.
current_channel
=
"commentary"
parser_builtin
.
current_recipient
=
"python"
builtin_items
=
parse_remaining_state
(
parser_builtin
)
# Should fall through to reasoning logic
assert
len
(
builtin_items
)
==
1
assert
not
isinstance
(
builtin_items
[
0
],
McpCall
)
assert
builtin_items
[
0
].
type
==
"reasoning"
def
test_parse_remaining_state_analysis_channel
()
->
None
:
"""Test parse_remaining_state with analysis channel and various recipients."""
from
unittest.mock
import
Mock
from
vllm.entrypoints.openai.parser.harmony_utils
import
parse_remaining_state
# Test 1: functions.* recipient → should return function tool call
parser_func
=
Mock
()
parser_func
.
current_content
=
'{"arg": "value"}'
parser_func
.
current_role
=
Role
.
ASSISTANT
parser_func
.
current_channel
=
"analysis"
parser_func
.
current_recipient
=
"functions.my_tool"
func_items
=
parse_remaining_state
(
parser_func
)
assert
len
(
func_items
)
==
1
assert
not
isinstance
(
func_items
[
0
],
McpCall
)
assert
func_items
[
0
].
type
==
"function_call"
assert
func_items
[
0
].
name
==
"my_tool"
assert
func_items
[
0
].
status
==
"in_progress"
# Test 2: MCP tool (not builtin) → should return MCP call
parser_mcp
=
Mock
()
parser_mcp
.
current_content
=
'{"query": "test"}'
parser_mcp
.
current_role
=
Role
.
ASSISTANT
parser_mcp
.
current_channel
=
"analysis"
parser_mcp
.
current_recipient
=
"database"
mcp_items
=
parse_remaining_state
(
parser_mcp
)
assert
len
(
mcp_items
)
==
1
assert
isinstance
(
mcp_items
[
0
],
McpCall
)
assert
mcp_items
[
0
].
type
==
"mcp_call"
assert
mcp_items
[
0
].
name
==
"database"
assert
mcp_items
[
0
].
server_label
==
"database"
assert
mcp_items
[
0
].
status
==
"in_progress"
# Test 3: Built-in tool (container)
# should NOT return MCP call, falls through to reasoning
parser_builtin
=
Mock
()
parser_builtin
.
current_content
=
"docker run"
parser_builtin
.
current_role
=
Role
.
ASSISTANT
parser_builtin
.
current_channel
=
"analysis"
parser_builtin
.
current_recipient
=
"container"
builtin_items
=
parse_remaining_state
(
parser_builtin
)
# Should fall through to reasoning logic
assert
len
(
builtin_items
)
==
1
assert
not
isinstance
(
builtin_items
[
0
],
McpCall
)
assert
builtin_items
[
0
].
type
==
"reasoning"
assert
msg
is
None
tests/entrypoints/openai/responses/conftest.py
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
import
json
import
logging
from
collections.abc
import
Callable
from
typing
import
Any
import
pytest
# Module-level logger; log_response_diagnostics() in this file writes to it.
logger = logging.getLogger(__name__)

# Environment variables shared by the tests in this package.
# NOTE(review): presumably passed to the server under test; the consumer is
# outside this view - confirm against the callers.
BASE_TEST_ENV = {
    # The day vLLM said "hello world" on arxiv 🚀
    "VLLM_SYSTEM_START_DATE": "2023-09-12",
}
# Default value of the ``max_retries`` parameter of retry_for_tool_call()
# and retry_streaming_for().
DEFAULT_MAX_RETRIES = 3
@
pytest
.
fixture
def
pairs_of_event_types
()
->
dict
[
str
,
str
]:
...
...
@@ -24,7 +39,325 @@ def pairs_of_event_types() -> dict[str, str]:
"response.mcp_call.completed"
:
"response.mcp_call.in_progress"
,
"response.function_call_arguments.done"
:
"response.function_call_arguments.delta"
,
# noqa: E501
"response.code_interpreter_call_code.done"
:
"response.code_interpreter_call_code.delta"
,
# noqa: E501
"response.code_interpreter_call.completed"
:
"response.code_interpreter_call.in_progress"
,
# noqa: E501
"response.web_search_call.completed"
:
"response.web_search_call.in_progress"
,
}
# fmt: on
return
event_pairs
async def retry_for_tool_call(
    client,
    *,
    model: str,
    expected_tool_type: str,
    max_retries: int = DEFAULT_MAX_RETRIES,
    **create_kwargs: Any,
):
    """Call ``client.responses.create`` until the expected tool call shows up.

    The request is issued up to *max_retries* times. The first response whose
    ``output`` contains an item with ``type == expected_tool_type`` is
    returned immediately.

    Args:
        client: Object exposing an awaitable ``responses.create``.
        model: Forwarded to ``responses.create`` as ``model``.
        expected_tool_type: Output item ``type`` that ends the retry loop.
        max_retries: Maximum number of requests to issue.
        **create_kwargs: Forwarded unchanged to ``responses.create``.

    Returns:
        The first matching response, or the **last** response when none
        matched, so the caller's own assertions fire with a clear diagnostic.

    Raises:
        AssertionError: If no request was issued at all (``max_retries < 1``).
    """
    last_response = None
    for _ in range(max_retries):
        last_response = await client.responses.create(model=model, **create_kwargs)
        # Reuse the shared predicate instead of re-spelling it here.
        if has_output_type(last_response, expected_tool_type):
            return last_response
    assert last_response is not None, (
        f"retry_for_tool_call issued no requests (max_retries={max_retries})"
    )
    return last_response
async def retry_streaming_for(
    client,
    *,
    model: str,
    validate_events: Callable[[list], bool],
    max_retries: int = DEFAULT_MAX_RETRIES,
    **create_kwargs: Any,
) -> list:
    """Stream ``client.responses.create(stream=True)`` until the events validate.

    Each attempt drains the whole stream into a list and hands that list to
    *validate_events*. The first list for which it returns a truthy value is
    returned. If no attempt validates, the events of the last attempt are
    returned (an empty list when no attempt was made).
    """
    collected: list = []
    for _ in range(max_retries):
        stream = await client.responses.create(
            model=model, stream=True, **create_kwargs
        )
        collected = [event async for event in stream]
        if validate_events(collected):
            break
    return collected
def has_output_type(response, type_name: str) -> bool:
    """Tell whether any item in ``response.output`` has ``type == type_name``.

    Items without a ``type`` attribute are treated as non-matching.
    """
    for entry in response.output:
        if getattr(entry, "type", None) == type_name:
            return True
    return False
def events_contain_type(events: list, type_substring: str) -> bool:
    """Return True if any event's ``type`` contains *type_substring*.

    Events that have no ``type`` attribute, or whose ``type`` is not a string
    (for example ``None``), are skipped instead of raising ``TypeError`` on
    the substring test.
    """
    for event in events:
        event_type = getattr(event, "type", None)
        if isinstance(event_type, str) and type_substring in event_type:
            return True
    return False
def _validate_event_pairing(events: list, pairs_of_event_types: dict[str, str]) -> None:
    """Check that start/end streaming events nest like balanced brackets.

    *pairs_of_event_types* maps each end-event type to its start-event type.
    Start events are pushed on a stack and every end event must close the
    start event currently on top. Event types that appear on neither side of
    the mapping are ignored.
    """
    closing_types = set(pairs_of_event_types)
    opening_types = set(pairs_of_event_types.values())
    open_stack: list[str] = []

    for ev in events:
        kind = ev.type

        if kind in closing_types:
            wanted = pairs_of_event_types[kind]
            top = open_stack[-1] if open_stack else "<empty>"
            assert open_stack and top == wanted, (
                f"Stack mismatch for {kind}: expected {wanted}, got {top}"
            )
            open_stack.pop()
            continue

        if kind not in opening_types:
            # Passthrough event (e.g. response.in_progress,
            # web_search_call.searching, code_interpreter_call.interpreting).
            continue

        # A run of identical delta events occupies one stack slot only.
        repeats_top_delta = (
            kind.endswith("delta") and open_stack and open_stack[-1] == kind
        )
        if not repeats_top_delta:
            open_stack.append(kind)

    assert not open_stack, f"Unclosed events on stack: {open_stack}"
def _validate_event_ordering(events: list) -> None:
    """Check the position and multiplicity of the envelope events.

    Requires at least two events, ``response.created`` first and
    ``response.completed`` last, each occurring exactly once. If
    ``response.in_progress`` occurs, it must occur once, as the second event.
    """
    total = len(events)
    assert total >= 2, f"Expected at least 2 events, got {total}"

    # First event must be response.created
    first_type = events[0].type
    assert first_type == "response.created", (
        f"First event must be response.created, got {first_type}"
    )

    # Last event must be response.completed
    last_type = events[-1].type
    assert last_type == "response.completed", (
        f"Last event must be response.completed, got {last_type}"
    )

    all_types = [e.type for e in events]

    # response.in_progress, if present, must be the second event
    in_progress_indices = [
        pos for pos, kind in enumerate(all_types) if kind == "response.in_progress"
    ]
    if in_progress_indices:
        assert in_progress_indices == [1], (
            f"response.in_progress must be the second event, "
            f"found at indices {in_progress_indices}"
        )

    # Exactly one created and one completed
    created_count = all_types.count("response.created")
    completed_count = all_types.count("response.completed")
    assert created_count == 1, (
        f"Expected exactly 1 response.created, got {created_count}"
    )
    assert completed_count == 1, (
        f"Expected exactly 1 response.completed, got {completed_count}"
    )
def _validate_field_consistency(events: list) -> None:
    """Check item_id, output_index and content_index across related events.

    ``response.output_item.added`` opens an output item and records its id and
    output index. Every later event is compared against those values until
    ``response.output_item.done`` closes the item. The ``content_part.added``
    and ``reasoning_part.added`` events additionally record the content index
    that following events are compared against.
    """
    envelope_types = {
        "response.created",
        "response.in_progress",
        "response.completed",
    }
    part_opening_types = (
        "response.content_part.added",
        "response.reasoning_part.added",
    )

    current_item_id: str | None = None
    current_output_index: int | None = None
    current_content_index: int | None = None
    previous_output_index: int = -1

    for event in events:
        etype = event.type

        if etype in envelope_types:
            continue

        if etype == "response.output_item.added":
            # Opens a new item.
            item = getattr(event, "item", None)
            output_index = getattr(event, "output_index", None)
            assert item is not None, "output_item.added must have an item"
            item_id = getattr(item, "id", None)
            assert item_id, "output_item.added item must have an id"

            # output_index must be non-decreasing across items
            if output_index is not None:
                assert output_index >= previous_output_index, (
                    f"output_index went backwards: "
                    f"{output_index} < {previous_output_index}"
                )
                previous_output_index = output_index

            current_item_id = item_id
            current_output_index = output_index
            current_content_index = None

        elif etype == "response.output_item.done":
            # Closes the active item.
            item = getattr(event, "item", None)
            output_index = getattr(event, "output_index", None)
            assert item is not None, "output_item.done must have an item"
            done_item_id = getattr(item, "id", None)

            if current_item_id is not None and done_item_id:
                assert done_item_id == current_item_id, (
                    f"output_item.done item.id mismatch: "
                    f"expected {current_item_id}, got {done_item_id}"
                )
            if current_output_index is not None and output_index is not None:
                assert output_index == current_output_index, (
                    f"output_item.done output_index mismatch: "
                    f"expected {current_output_index}, got {output_index}"
                )

            current_item_id = None
            current_output_index = None
            current_content_index = None

        else:
            # Every other item-level event must agree with the active item.
            _assert_item_fields(event, etype, current_item_id, current_output_index)
            content_index = getattr(event, "content_index", None)

            if etype in part_opening_types:
                # content_part / reasoning_part added: sets content_index
                current_content_index = content_index
            elif content_index is not None and current_content_index is not None:
                # content_index (only meaningful on events that carry it)
                assert content_index == current_content_index, (
                    f"{etype} content_index mismatch: "
                    f"expected {current_content_index}, got {content_index}"
                )
def _assert_item_fields(
    event,
    etype: str,
    active_item_id: str | None,
    active_output_index: int | None,
) -> None:
    """Assert that *event* agrees with the active item's id and output index.

    A field is only compared when both the active value and the event's value
    are not ``None``. *etype* is used only in the failure message.
    """
    seen_item_id = getattr(event, "item_id", None)
    seen_output_index = getattr(event, "output_index", None)

    if not (active_item_id is None or seen_item_id is None):
        assert seen_item_id == active_item_id, (
            f"{etype} item_id mismatch: expected {active_item_id}, got {seen_item_id}"
        )

    if not (active_output_index is None or seen_output_index is None):
        assert seen_output_index == active_output_index, (
            f"{etype} output_index mismatch: "
            f"expected {active_output_index}, got {seen_output_index}"
        )
def validate_streaming_event_stack(
    events: list, pairs_of_event_types: dict[str, str]
) -> None:
    """Run every streaming-event validator over *events*.

    The checks run in this order, and the first failing assertion propagates:

    1. **Event pairing** - start/end events nest properly, using the
       end -> start mapping in *pairs_of_event_types*.
    2. **Event ordering** - the envelope events (``created``,
       ``in_progress``, ``completed``) sit at the expected positions.
    3. **Field consistency** - ``item_id``, ``output_index`` and
       ``content_index`` agree across the events of each output item.
    """
    _validate_event_pairing(events, pairs_of_event_types)
    # The remaining validators only need the event list itself.
    for check in (_validate_event_ordering, _validate_field_consistency):
        check(events)
def log_response_diagnostics(
    response,
    *,
    label: str = "Response Diagnostics",
) -> dict[str, Any]:
    """Extract diagnostic info from a Responses API response and log it.

    Collects reasoning text, tool-call attempts, MCP items and output types,
    then logs them at INFO level as indented JSON under *label*.

    Args:
        response: Object with ``output`` (iterable of items) and
            ``output_text``; ``output_messages`` is read when present.
        label: Heading used in the log record.

    Returns:
        The diagnostics dict with keys ``model_attempted_tool_calls``,
        ``tool_call_attempts``, ``mcp_items``, ``reasoning``, ``output_text``
        and ``output_types``, so callers can make further assertions.
    """
    output_items = response.output

    # A reasoning item can carry ``content=None`` (the field is optional), so
    # a getattr default alone is not enough: normalise None to an empty list.
    reasoning_texts = [
        text
        for item in output_items
        if getattr(item, "type", None) == "reasoning"
        for content in (getattr(item, "content", None) or [])
        if (text := getattr(content, "text", None))
    ]

    # NOTE(review): output_messages looks optional on the response object -
    # confirm. A missing or None value is treated as "no messages".
    output_messages = getattr(response, "output_messages", None) or []
    tool_call_attempts = [
        {
            "recipient": msg.get("recipient"),
            "channel": msg.get("channel"),
        }
        for msg in output_messages
        if (msg.get("recipient") or "").startswith("python")
    ]

    mcp_items = [
        {
            "name": getattr(item, "name", None),
            "status": getattr(item, "status", None),
        }
        for item in output_items
        if getattr(item, "type", None) == "mcp_call"
    ]

    output_types = [getattr(o, "type", None) for o in output_items]

    diagnostics = {
        "model_attempted_tool_calls": bool(tool_call_attempts),
        "tool_call_attempts": tool_call_attempts,
        "mcp_items": mcp_items,
        "reasoning": reasoning_texts,
        "output_text": response.output_text,
        "output_types": output_types,
    }

    logger.info(
        "\n====== %s ======\n%s\n==============================",
        label,
        json.dumps(diagnostics, indent=2, default=str),
    )
    return diagnostics
tests/entrypoints/openai/responses/test_errors.py
View file @
3fb4b5fa
...
...
@@ -6,7 +6,6 @@ from unittest.mock import MagicMock
import
pytest
from
vllm.entrypoints.openai.engine.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.engine.serving
import
GenerationError
,
OpenAIServing
...
...
@@ -38,32 +37,6 @@ async def test_raise_if_error_raises_generation_error():
serving
.
_raise_if_error
(
None
,
"test-request-id"
)
# should not raise
@
pytest
.
mark
.
asyncio
async
def
test_convert_generation_error_to_response
():
"""test _convert_generation_error_to_response creates proper ErrorResponse"""
mock_engine
=
MagicMock
()
mock_engine
.
model_config
=
MagicMock
()
mock_engine
.
model_config
.
max_model_len
=
100
mock_models
=
MagicMock
()
serving
=
OpenAIServing
(
engine_client
=
mock_engine
,
models
=
mock_models
,
request_logger
=
None
,
)
# create a GenerationError
gen_error
=
GenerationError
(
"Internal server error"
)
# convert to ErrorResponse
error_response
=
serving
.
_convert_generation_error_to_response
(
gen_error
)
assert
isinstance
(
error_response
,
ErrorResponse
)
assert
error_response
.
error
.
type
==
"InternalServerError"
assert
error_response
.
error
.
message
==
"Internal server error"
assert
error_response
.
error
.
code
==
HTTPStatus
.
INTERNAL_SERVER_ERROR
@
pytest
.
mark
.
asyncio
async
def
test_convert_generation_error_to_streaming_response
():
"""test _convert_generation_error_to_streaming_response output"""
...
...
Prev
1
…
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment