Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
993 additions
and
74 deletions
+993
-74
tests/entrypoints/pooling/embed/test_online.py
tests/entrypoints/pooling/embed/test_online.py
+13
-5
tests/entrypoints/pooling/embed/test_online_dimensions.py
tests/entrypoints/pooling/embed/test_online_dimensions.py
+4
-5
tests/entrypoints/pooling/embed/test_online_long_text.py
tests/entrypoints/pooling/embed/test_online_long_text.py
+5
-6
tests/entrypoints/pooling/embed/test_online_vision.py
tests/entrypoints/pooling/embed/test_online_vision.py
+1
-9
tests/entrypoints/pooling/score/test_correctness_mteb.py
tests/entrypoints/pooling/score/test_correctness_mteb.py
+5
-6
tests/entrypoints/pooling/score/test_offline.py
tests/entrypoints/pooling/score/test_offline.py
+7
-5
tests/entrypoints/pooling/score/test_online_rerank.py
tests/entrypoints/pooling/score/test_online_rerank.py
+4
-5
tests/entrypoints/pooling/score/test_online_score.py
tests/entrypoints/pooling/score/test_online_score.py
+8
-18
tests/entrypoints/pooling/score/test_utils.py
tests/entrypoints/pooling/score/test_utils.py
+351
-0
tests/entrypoints/rpc/__init__.py
tests/entrypoints/rpc/__init__.py
+0
-0
tests/entrypoints/rpc/test_collective_rpc.py
tests/entrypoints/rpc/test_collective_rpc.py
+1
-1
tests/entrypoints/sleep/__init__.py
tests/entrypoints/sleep/__init__.py
+0
-0
tests/entrypoints/sleep/test_sleep.py
tests/entrypoints/sleep/test_sleep.py
+1
-1
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+6
-9
tests/entrypoints/test_grpc_server.py
tests/entrypoints/test_grpc_server.py
+428
-0
tests/entrypoints/test_responses_utils.py
tests/entrypoints/test_responses_utils.py
+118
-0
tests/entrypoints/test_utils.py
tests/entrypoints/test_utils.py
+10
-0
tests/evals/gsm8k/README.md
tests/evals/gsm8k/README.md
+9
-4
tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml
tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml
+11
-0
tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml
tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml
+11
-0
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/entrypoints/pooling/embed/test_online.py
View file @
7e63ef82
...
...
@@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
decode_pooling_output
,
)
if
current_platform
.
is_rocm
():
pytest
.
skip
(
"Encoder self-attention is not implemented on ROCm."
,
allow_module_level
=
True
)
MODEL_NAME
=
"intfloat/multilingual-e5-small"
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
DTYPE
=
"bfloat16"
if
current_platform
.
is_rocm
():
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch
.
backends
.
cuda
.
enable_flash_sdp
(
False
)
torch
.
backends
.
cuda
.
enable_mem_efficient_sdp
(
False
)
torch
.
backends
.
cuda
.
enable_math_sdp
(
True
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
...
...
@@ -53,6 +57,10 @@ def server():
DUMMY_CHAT_TEMPLATE
,
]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if
current_platform
.
is_rocm
():
args
.
extend
([
"--attention-backend"
,
"FLEX_ATTENTION"
])
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
...
...
tests/entrypoints/pooling/embed/test_online_dimensions.py
View file @
7e63ef82
...
...
@@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
from
vllm.entrypoints.pooling.embed.protocol
import
EmbeddingResponse
from
vllm.platforms
import
current_platform
if
current_platform
.
is_rocm
():
pytest
.
skip
(
"Encoder self-attention is not implemented on ROCm."
,
allow_module_level
=
True
)
MODELS
=
[
EmbedModelInfo
(
"intfloat/multilingual-e5-small"
,
is_matryoshka
=
False
),
EmbedModelInfo
(
...
...
@@ -62,6 +57,10 @@ def server(model_info, dtype: str):
[
"--trust_remote_code"
,
"--hf_overrides"
,
'{"matryoshka_dimensions":[256]}'
]
)
# ROCm: Use Flex Attention to support encoder-only self-attention.
if
current_platform
.
is_rocm
():
args
.
extend
([
"--attention-backend"
,
"FLEX_ATTENTION"
])
with
RemoteOpenAIServer
(
model_info
.
name
,
args
)
as
remote_server
:
yield
remote_server
...
...
tests/entrypoints/pooling/embed/test_online_long_text.py
View file @
7e63ef82
...
...
@@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
from
vllm.entrypoints.pooling.embed.protocol
import
EmbeddingResponse
from
vllm.platforms
import
current_platform
if
current_platform
.
is_rocm
():
pytest
.
skip
(
"Encoder self-attention is not implemented on ROCm."
,
allow_module_level
=
True
)
def
_generate_random_text
(
word_count
:
int
)
->
str
:
"""Generate random text with approximately the specified word count."""
...
...
@@ -221,13 +216,17 @@ def server_with_chunked_processing():
"512"
,
# Set smaller max_model_len to trigger chunking mechanism
"--pooler-config"
,
(
'{"pooling_type": "MEAN", "
normalize
": true, '
'{"pooling_type": "MEAN", "
use_activation
": true, '
'"enable_chunked_processing": true, "max_embed_len": 10000}'
),
"--gpu-memory-utilization"
,
"0.8"
,
]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if
current_platform
.
is_rocm
():
args
.
extend
([
"--attention-backend"
,
"FLEX_ATTENTION"
])
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
...
...
tests/entrypoints/pooling/embed/test_online_vision.py
View file @
7e63ef82
...
...
@@ -11,7 +11,7 @@ from transformers import AutoProcessor
from
tests.utils
import
VLLM_PATH
,
RemoteOpenAIServer
from
vllm.entrypoints.pooling.embed.protocol
import
EmbeddingResponse
from
vllm.multimodal.base
import
MediaWithBytes
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
from
vllm.multimodal.utils
import
fetch_image
from
...utils
import
models_path_prefix
,
urls_port
...
...
@@ -55,14 +55,6 @@ def server():
yield
remote_server
@
pytest
.
fixture
(
scope
=
"session"
)
def
base64_encoded_image
(
local_asset_server
)
->
dict
[
str
,
str
]:
return
{
image_url
:
encode_image_base64
(
local_asset_server
.
get_image_asset
(
image_url
))
for
image_url
in
TEST_IMAGE_ASSETS
}
def
get_hf_prompt_tokens
(
model_name
,
content
,
image_url
):
processor
=
AutoProcessor
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
,
num_crops
=
4
...
...
tests/entrypoints/pooling/score/test_correctness_mteb.py
View file @
7e63ef82
...
...
@@ -4,7 +4,7 @@ import os
import
pytest
from
tests.models.language.pooling_mteb_test.mteb_utils
import
(
from
tests.models.language.pooling_mteb_test.mteb_
score_
utils
import
(
MTEB_RERANK_LANGS
,
MTEB_RERANK_TASKS
,
MTEB_RERANK_TOL
,
...
...
@@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
from
tests.utils
import
RemoteOpenAIServer
from
vllm.platforms
import
current_platform
if
current_platform
.
is_rocm
():
pytest
.
skip
(
"Encoder self-attention is not implemented on ROCm."
,
allow_module_level
=
True
)
os
.
environ
[
"VLLM_LOGGING_LEVEL"
]
=
"WARNING"
MODEL_NAME
=
"cross-encoder/ms-marco-MiniLM-L-6-v2"
...
...
@@ -30,6 +25,10 @@ st_main_score = 0.33457
def
server
():
args
=
[
"--runner"
,
"pooling"
,
"--enforce-eager"
,
"--disable-uvicorn-access-log"
]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if
current_platform
.
is_rocm
():
args
.
extend
([
"--attention-backend"
,
"FLEX_ATTENTION"
])
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
...
...
tests/entrypoints/pooling/score/test_offline.py
View file @
7e63ef82
...
...
@@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.platforms
import
current_platform
if
current_platform
.
is_rocm
():
pytest
.
skip
(
"Encoder self-attention is not implemented on ROCm."
,
allow_module_level
=
True
)
MODEL_NAME
=
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config
=
None
if
current_platform
.
is_rocm
():
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
...
...
@@ -30,6 +31,7 @@ def llm():
gpu_memory_utilization
=
0.75
,
enforce_eager
=
True
,
seed
=
0
,
attention_config
=
attention_config
,
)
yield
weakref
.
proxy
(
llm
)
...
...
tests/entrypoints/pooling/score/test_online_rerank.py
View file @
7e63ef82
...
...
@@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from
vllm.entrypoints.pooling.score.protocol
import
RerankResponse
from
vllm.platforms
import
current_platform
if
current_platform
.
is_rocm
():
pytest
.
skip
(
"Encoder self-attention is not implemented on ROCm."
,
allow_module_level
=
True
)
MODEL_NAME
=
"BAAI/bge-reranker-base"
DTYPE
=
"bfloat16"
...
...
@@ -24,6 +19,10 @@ DTYPE = "bfloat16"
def
server
():
args
=
[
"--enforce-eager"
,
"--max-model-len"
,
"100"
,
"--dtype"
,
DTYPE
]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if
current_platform
.
is_rocm
():
args
.
extend
([
"--attention-backend"
,
"FLEX_ATTENTION"
])
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
...
...
tests/entrypoints/pooling/score/test_online_score.py
View file @
7e63ef82
...
...
@@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer
from
vllm.entrypoints.pooling.score.protocol
import
ScoreResponse
from
vllm.platforms
import
current_platform
if
current_platform
.
is_rocm
():
pytest
.
skip
(
"Encoder self-attention is not implemented on ROCm."
,
allow_module_level
=
True
)
MODELS
=
[
{
"name"
:
"BAAI/bge-reranker-v2-m3"
,
"is_cross_encoder"
:
True
},
{
"name"
:
"BAAI/bge-base-en-v1.5"
,
"is_cross_encoder"
:
False
},
...
...
@@ -44,6 +39,10 @@ def model(request):
def
server
(
model
:
dict
[
str
,
Any
]):
args
=
[
"--enforce-eager"
,
"--max-model-len"
,
"100"
,
"--dtype"
,
DTYPE
]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if
current_platform
.
is_rocm
():
args
.
extend
([
"--attention-backend"
,
"FLEX_ATTENTION"
])
with
RemoteOpenAIServer
(
model
[
"name"
],
args
)
as
remote_server
:
yield
remote_server
...
...
@@ -237,17 +236,14 @@ class TestModel:
"use_activation"
:
use_activation
,
},
)
if
response
.
status_code
!=
200
:
return
response
outputs
=
response
.
json
()
return
torch
.
tensor
([
x
[
"score"
]
for
x
in
outputs
[
"data"
]])
if
model
[
"is_cross_encoder"
]:
default
=
get_outputs
(
use_activation
=
None
)
w_activation
=
get_outputs
(
use_activation
=
True
)
wo_activation
=
get_outputs
(
use_activation
=
False
)
default
=
get_outputs
(
use_activation
=
None
)
w_activation
=
get_outputs
(
use_activation
=
True
)
wo_activation
=
get_outputs
(
use_activation
=
False
)
if
model
[
"is_cross_encoder"
]:
assert
torch
.
allclose
(
default
,
w_activation
,
atol
=
1e-2
),
(
"Default should use activation."
)
...
...
@@ -257,9 +253,3 @@ class TestModel:
assert
torch
.
allclose
(
F
.
sigmoid
(
wo_activation
),
w_activation
,
atol
=
1e-2
),
(
"w_activation should be close to activation(wo_activation)."
)
else
:
get_outputs
(
use_activation
=
None
)
# The activation parameter only works for the is_cross_encoder model
response
=
get_outputs
(
use_activation
=
True
)
assert
response
.
status_code
==
400
tests/entrypoints/pooling/score/test_utils.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
unittest.mock
import
patch
import
pytest
from
vllm.config
import
ModelConfig
from
vllm.entrypoints.chat_utils
import
ChatTemplateResolutionError
from
vllm.entrypoints.score_utils
import
get_score_prompt
from
vllm.inputs
import
TokensPrompt
from
vllm.tokenizers
import
get_tokenizer
# A cross-encoder model for testing
CROSS_ENCODER_MODEL_ID
=
"cross-encoder/ms-marco-MiniLM-L-6-v2"
def assert_prompt_tokenization_consistent(
    tokenizer, full_prompt, engine_prompt, add_special_tokens=True
):
    """Check that re-tokenizing ``full_prompt`` reproduces the engine prompt.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=...)``
            whose result is indexed with ``"input_ids"``.
        full_prompt: Prompt text to tokenize.
        engine_prompt: Mapping that holds the ids under ``"prompt_token_ids"``.
        add_special_tokens: Forwarded unchanged to the tokenizer call.

    Raises:
        AssertionError: if the two id sequences differ.
    """
    encoding = tokenizer(full_prompt, add_special_tokens=add_special_tokens)
    want = encoding["input_ids"]
    got = engine_prompt["prompt_token_ids"]
    assert got == want, (
        f"Token IDs don't match.\nExpected: {want}\nActual: {got}"
    )
@pytest.fixture(scope="module")
def cross_encoder_model_config():
    """Module-scoped ModelConfig for the cross-encoder with the pooling runner."""
    model_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
    return model_config
@pytest.fixture(scope="module")
def cross_encoder_tokenizer(cross_encoder_model_config):
    """Module-scoped tokenizer for the cross-encoder model.

    ``trust_remote_code`` is copied from the model config fixture so both
    objects are created with the same setting.
    """
    trust_remote_code = cross_encoder_model_config.trust_remote_code
    tokenizer = get_tokenizer(
        CROSS_ENCODER_MODEL_ID, trust_remote_code=trust_remote_code
    )
    return tokenizer
@pytest.fixture(scope="module")
def llm_reranker_model_config():
    """Model config imitating an LLM-as-reranker setup.

    Same model and runner as the cross-encoder config, but with
    ``use_sep_token`` switched off on the underlying HF config.
    """
    reranker_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
    # ModelConfig.use_sep_token is a property backed by hf_config, so the
    # override has to be written there to replace the default (True).
    hf_config = reranker_config.hf_config
    hf_config.use_sep_token = False
    return reranker_config
@pytest.fixture
def tokenization_kwargs():
    """Fresh dict of the tokenizer keyword arguments most tests share."""
    shared_kwargs = dict(add_special_tokens=True, return_tensors=None)
    return shared_kwargs
@pytest.fixture
def mock_model_with_score_template():
    """Return a stand-in model class that advertises score-template support.

    The class records every prompt handed to ``post_process_tokens`` in its
    ``post_process_called`` list so tests can inspect the calls afterwards.
    """

    class MockModelWithScoreTemplate:
        supports_score_template = True
        # Class-level call log; a new class (and list) is built per fixture use.
        post_process_called: list[TokensPrompt] = []

        @staticmethod
        def get_score_template(p1: str, p2: str) -> str:
            # Fixed, easily recognisable layout so tests can assert on it.
            return f"[QUERY]{p1}[SEP][DOC]{p2}"

        @staticmethod
        def post_process_tokens(prompt: TokensPrompt) -> None:
            call_log = MockModelWithScoreTemplate.post_process_called
            call_log.append(prompt)

    return MockModelWithScoreTemplate
@pytest.fixture
def mock_model_no_score_template():
    """Return a stand-in model class whose ``supports_score_template`` is False."""

    class MockModelNoScoreTemplate:
        # The only attribute this stub defines.
        supports_score_template = False

    return MockModelNoScoreTemplate
class TestGetScorePrompt:
    """Exercise get_score_prompt across its template and fallback paths."""

    # Dotted paths replaced with mocks in the tests below.
    _GET_MODEL_CLS = "vllm.model_executor.model_loader.get_model_cls"
    _APPLY_CHAT_TEMPLATE = "vllm.entrypoints.score_utils.apply_hf_chat_template"

    def test_tokenization_kwargs_passed_through(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
    ):
        """Caller-supplied truncation settings must be honoured."""
        query = "Query text"
        document = "Document text"
        max_tokens = 20

        # This test needs truncation, so it builds its own kwargs instead of
        # using the shared fixture.
        truncating_kwargs = {
            "add_special_tokens": True,
            "return_tensors": None,
            "truncation": True,
            "max_length": max_tokens,
        }

        prompt_text, tokens_prompt = get_score_prompt(
            llm_reranker_model_config,
            cross_encoder_tokenizer,
            truncating_kwargs,
            query,
            document,
        )

        assert isinstance(prompt_text, str)
        assert "prompt_token_ids" in tokens_prompt
        # Truncation with max_length=20 caps the number of ids.
        assert len(tokens_prompt["prompt_token_ids"]) <= max_tokens

        # The (possibly truncated) ids must be a prefix of the untruncated
        # encoding of the same prompt text.
        untruncated = cross_encoder_tokenizer(prompt_text, add_special_tokens=True)
        full_ids = untruncated["input_ids"]
        actual_ids = tokens_prompt["prompt_token_ids"]
        assert full_ids[: len(actual_ids)] == actual_ids, (
            f"Token IDs are not a prefix of full encoding.\n"
            f"Full IDs: {full_ids}\n"
            f"Actual IDs: {actual_ids}"
        )

    def test_model_supports_score_template(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """The model's own score template is used when none is passed in."""
        with patch(self._GET_MODEL_CLS, return_value=mock_model_with_score_template):
            prompt_text, tokens_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query text",
                "document text",
            )

        assert prompt_text == "[QUERY]query text[SEP][DOC]document text"
        assert "prompt_token_ids" in tokens_prompt
        assert len(tokens_prompt["prompt_token_ids"]) > 0
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, prompt_text, tokens_prompt
        )

    def test_model_supports_score_template_but_custom_template_provided(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """An explicit score_template wins over the model's built-in one."""
        custom_template = (
            'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}'
        )

        with patch(self._GET_MODEL_CLS, return_value=mock_model_with_score_template):
            prompt_text, tokens_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
                score_template=custom_template,  # explicit template
            )

        assert "prompt_token_ids" in tokens_prompt
        assert prompt_text == "TEMPLATE_USED query doc"
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, prompt_text, tokens_prompt
        )

    def test_not_using_default_template(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        # FIXME: For now, a template is only applied when one is explicitly
        # provided. The tokenizer's chat template cannot be relied on because
        # many models inherit junk templates from their base LLM, which breaks
        # both the models and the tests that use them.
        model_patch = patch(
            self._GET_MODEL_CLS, return_value=mock_model_no_score_template
        )
        template_patch = patch(
            self._APPLY_CHAT_TEMPLATE, return_value="test querytest doc"
        )
        with model_patch, template_patch:
            prompt_text, tokens_prompt = get_score_prompt(
                llm_reranker_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "test query",
                "test doc",
            )

        assert prompt_text == "test querytest doc"
        assert "prompt_token_ids" in tokens_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, prompt_text, tokens_prompt
        )

    def test_fallback_with_sep_token(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Fallback when template resolution fails and use_sep_token=True."""
        model_patch = patch(
            self._GET_MODEL_CLS, return_value=mock_model_no_score_template
        )
        template_patch = patch(
            self._APPLY_CHAT_TEMPLATE,
            side_effect=ChatTemplateResolutionError("No template"),
        )
        with model_patch, template_patch:
            prompt_text, tokens_prompt = get_score_prompt(
                cross_encoder_model_config,  # use_sep_token=True
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )

        assert "prompt_token_ids" in tokens_prompt
        # text_pair encoding is expected to produce token_type_ids as well.
        assert "token_type_ids" in tokens_prompt
        assert "query" in prompt_text
        assert "document" in prompt_text
        assert prompt_text != "querydocument"

        pair_encoding = cross_encoder_tokenizer(
            "query", text_pair="document", add_special_tokens=True
        )
        assert tokens_prompt["prompt_token_ids"] == pair_encoding["input_ids"]

        # FIXME(?): add_special_tokens=False is needed because in this case
        # full_prompt is obtained by decoding the tokenized prompt, which
        # already includes special tokens; re-adding them would duplicate
        # them. This is inconsistent with the other cases.
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer,
            prompt_text,
            tokens_prompt,
            add_special_tokens=False,
        )

    def test_fallback_without_sep_token(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Fallback when template resolution fails and use_sep_token=False."""
        model_patch = patch(
            self._GET_MODEL_CLS, return_value=mock_model_no_score_template
        )
        template_patch = patch(
            self._APPLY_CHAT_TEMPLATE,
            side_effect=ChatTemplateResolutionError("No template"),
        )
        with model_patch, template_patch:
            prompt_text, tokens_prompt = get_score_prompt(
                llm_reranker_model_config,  # use_sep_token=False
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )

        assert prompt_text == "querydocument"
        assert "prompt_token_ids" in tokens_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, prompt_text, tokens_prompt
        )

    def test_post_process_tokens_called(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """post_process_tokens must receive the returned engine prompt."""
        call_log = mock_model_with_score_template.post_process_called
        # Start from an empty call log.
        call_log.clear()

        model_patch = patch(
            self._GET_MODEL_CLS, return_value=mock_model_with_score_template
        )
        template_patch = patch(
            self._APPLY_CHAT_TEMPLATE,
            side_effect=ChatTemplateResolutionError("No template"),
        )
        with model_patch, template_patch:
            prompt_text, tokens_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
            )

        # Exactly one call, and it received the very object that was returned.
        assert len(mock_model_with_score_template.post_process_called) == 1
        assert mock_model_with_score_template.post_process_called[0] is tokens_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, prompt_text, tokens_prompt
        )
tests/
v1/tpu/worker
/__init__.py
→
tests/
entrypoints/rpc
/__init__.py
View file @
7e63ef82
File moved
tests/entrypoints/
openai
/test_collective_rpc.py
→
tests/entrypoints/
rpc
/test_collective_rpc.py
View file @
7e63ef82
...
...
@@ -37,7 +37,7 @@ def server():
"--max-num-seqs"
,
"128"
,
"--worker-extension-cls"
,
"tests.entrypoints.
openai
.test_collective_rpc.TestWorkerExtension"
,
"tests.entrypoints.
rpc
.test_collective_rpc.TestWorkerExtension"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
...
...
vllm/attention/backends
/__init__.py
→
tests/entrypoints/sleep
/__init__.py
View file @
7e63ef82
File moved
tests/entrypoints/
openai
/test_sleep.py
→
tests/entrypoints/
sleep
/test_sleep.py
View file @
7e63ef82
...
...
@@ -5,7 +5,7 @@ import os
import
requests
from
prometheus_client.parser
import
text_string_to_metric_families
from
..
.utils
import
RemoteOpenAIServer
,
models_path_prefix
from
tests
.utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
...
...
tests/entrypoints/test_chat_utils.py
View file @
7e63ef82
...
...
@@ -26,9 +26,9 @@ from vllm.entrypoints.chat_utils import (
)
from
vllm.multimodal
import
MultiModalDataDict
,
MultiModalUUIDDict
from
vllm.multimodal.utils
import
(
encode_audio_
base64
,
encode_image_
base64
,
encode_video_
base64
,
encode_audio_
url
,
encode_image_
url
,
encode_video_
url
,
)
from
vllm.tokenizers
import
get_tokenizer
from
vllm.tokenizers.mistral
import
MistralTokenizer
...
...
@@ -142,22 +142,19 @@ def mistral_model_config():
@
pytest
.
fixture
(
scope
=
"module"
)
def
image_url
():
image
=
ImageAsset
(
"cherry_blossom"
)
base64
=
encode_image_base64
(
image
.
pil_image
)
return
f
"data:image/jpeg;base64,
{
base64
}
"
return
encode_image_url
(
image
.
pil_image
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
video_url
():
video
=
VideoAsset
(
"baby_reading"
,
1
)
base64
=
encode_video_base64
(
video
.
np_ndarrays
)
return
f
"data:video/jpeg;base64,
{
base64
}
"
return
encode_video_url
(
video
.
np_ndarrays
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
audio_url
():
audio
=
AudioAsset
(
"mary_had_lamb"
)
base64
=
encode_audio_base64
(
*
audio
.
audio_and_sample_rate
)
return
f
"data:audio/ogg;base64,
{
base64
}
"
return
encode_audio_url
(
*
audio
.
audio_and_sample_rate
)
def
_assert_mm_data_is_image_input
(
...
...
tests/entrypoints/test_grpc_server.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
End-to-end tests for the vLLM gRPC server.
"""
import
asyncio
import
socket
import
subprocess
import
sys
import
time
import
grpc
import
pytest
import
pytest_asyncio
from
vllm.grpc
import
vllm_engine_pb2
,
vllm_engine_pb2_grpc
# Use a small model for fast testing
MODEL_NAME
=
"hmellor/tiny-random-LlamaForCausalLM"
def find_free_port() -> int:
    """Return a TCP port number that the OS reports as currently unused.

    Binding to port 0 lets the OS pick the port. The probing socket is closed
    before returning, so the port is only known to be free at call time.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with probe:
        probe.bind(("", 0))
        probe.listen(1)
        _, chosen_port = probe.getsockname()
    return chosen_port
async def wait_for_server(port: int, timeout: float = 60.0) -> bool:
    """Poll the gRPC server's HealthCheck RPC until it reports healthy.

    Args:
        port: Port on localhost the server is expected to listen on.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as a health check answers ``healthy``; False if the
        deadline passes first.
    """
    # Monotonic clock: the deadline must not shift if the wall clock changes.
    deadline = time.monotonic() + timeout
    print("waiting for server to start...")
    while time.monotonic() < deadline:
        healthy = False
        try:
            # The context manager closes the channel even when the RPC raises,
            # so failed attempts no longer leak one channel per retry.
            async with grpc.aio.insecure_channel(f"localhost:{port}") as channel:
                stub = vllm_engine_pb2_grpc.VllmEngineStub(channel)
                request = vllm_engine_pb2.HealthCheckRequest()
                response = await stub.HealthCheck(request, timeout=5.0)
            healthy = bool(response.healthy)
        except Exception:
            # Deliberately broad: any failure (connection refused, RPC
            # timeout, ...) just means "not ready yet" while polling.
            healthy = False
        if healthy:
            print("server returned healthy=True")
            return True
        # Back off after every unsuccessful attempt, including a reachable but
        # not-yet-healthy server, instead of spinning in a tight loop.
        await asyncio.sleep(0.5)
    return False
class GrpcServerProcess:
    """Owns the lifecycle of a vLLM gRPC server launched as a child process."""

    def __init__(self):
        # Handle of the spawned server; None until start() has run.
        self.process: subprocess.Popen | None = None
        # Port chosen for the server; None until start() has run.
        self.port: int | None = None

    async def start(self):
        """Spawn the server on a free port and wait until it is healthy.

        Raises:
            RuntimeError: if wait_for_server() reports that the server did not
                become healthy; the child process is stopped first.
        """
        self.port = find_free_port()

        # Launch with the current interpreter so the child uses the same
        # environment as the test run.
        command = [
            sys.executable,
            "-m",
            "vllm.entrypoints.grpc_server",
            "--model",
            MODEL_NAME,
            "--host",
            "localhost",
            "--port",
            str(self.port),
            "--max-num-batched-tokens",
            "512",
            "--disable-log-stats-server",
        ]
        self.process = subprocess.Popen(command)

        ready = await wait_for_server(self.port)
        if ready:
            return
        self.stop()
        raise RuntimeError("gRPC server failed to start within timeout")

    def stop(self):
        """Terminate the server, escalating to kill after a 10 second wait."""
        child = self.process
        if not child:
            return
        child.terminate()
        try:
            child.wait(timeout=10)
        except subprocess.TimeoutExpired:
            # The process ignored terminate(); force it down and reap it.
            child.kill()
            child.wait()
@pytest_asyncio.fixture(scope="module")
async def grpc_server():
    """Module-scoped fixture: start one gRPC server subprocess, stop it after."""
    managed_server = GrpcServerProcess()
    await managed_server.start()

    yield managed_server

    # Teardown once the module's tests have finished.
    managed_server.stop()
@pytest_asyncio.fixture
async def grpc_client(grpc_server):
    """Per-test fixture yielding a VllmEngine stub bound to the test server."""
    target = f"localhost:{grpc_server.port}"
    channel = grpc.aio.insecure_channel(target)
    client_stub = vllm_engine_pb2_grpc.VllmEngineStub(channel)

    yield client_stub

    # Close the channel once the test is done with the stub.
    await channel.close()
@pytest.mark.asyncio
async def test_health_check(grpc_client):
    """HealthCheck reports healthy with the message "Health"."""
    reply = await grpc_client.HealthCheck(vllm_engine_pb2.HealthCheckRequest())

    assert reply.healthy is True
    assert reply.message == "Health"
@pytest.mark.asyncio
async def test_get_model_info(grpc_client):
    """GetModelInfo describes the model the server was launched with."""
    info = await grpc_client.GetModelInfo(vllm_engine_pb2.GetModelInfoRequest())

    assert info.model_path == MODEL_NAME
    assert info.is_generation is True
    assert info.max_context_length > 0
    assert info.vocab_size > 0
    assert info.supports_vision is False
@pytest.mark.asyncio
async def test_get_server_info(grpc_client):
    """GetServerInfo returns plausible runtime state for a running server."""
    info = await grpc_client.GetServerInfo(vllm_engine_pb2.GetServerInfoRequest())

    assert info.active_requests >= 0
    assert info.is_paused is False
    assert info.uptime_seconds >= 0
    assert info.server_type == "vllm-grpc"
    assert info.last_receive_timestamp > 0
@pytest.mark.asyncio
async def test_generate_non_streaming(grpc_client):
    """A non-streaming Generate call yields exactly one ``complete`` message."""
    prompt = vllm_engine_pb2.TokenizedInput(
        original_text="Hello, my name is",
        # Per the original author's note: GPT-2 tokens for the prompt.
        input_ids=[15496, 11, 616, 1438, 318],
    )
    sampling = vllm_engine_pb2.SamplingParams(
        temperature=0.0,
        max_tokens=10,
        n=1,
    )
    request = vllm_engine_pb2.GenerateRequest(
        request_id="test-non-streaming-1",
        tokenized=prompt,
        sampling_params=sampling,
        stream=False,
    )

    # Drain the response stream.
    responses = [message async for message in grpc_client.Generate(request)]

    # Non-streaming mode: a single, final message.
    assert len(responses) == 1
    (only_message,) = responses
    assert only_message.HasField("complete")

    result = only_message.complete
    assert len(result.output_ids) > 0
    assert result.finish_reason in ["stop", "length"]
    assert result.prompt_tokens > 0
    assert result.completion_tokens > 0
@pytest.mark.asyncio
async def test_generate_streaming(grpc_client):
    """Test the Generate RPC in streaming mode.

    Collects every ``chunk`` message and the final ``complete`` message, then
    checks the final message and the structure of each chunk.
    """
    request = vllm_engine_pb2.GenerateRequest(
        request_id="test-streaming-1",
        tokenized=vllm_engine_pb2.TokenizedInput(
            original_text="The capital of France is",
            input_ids=[464, 3139, 286, 4881, 318],  # GPT-2 tokens
        ),
        sampling_params=vllm_engine_pb2.SamplingParams(
            temperature=0.0, max_tokens=10, n=1
        ),
        stream=True,
    )

    # Collect all responses
    chunks = []
    complete_response = None
    async for response in grpc_client.Generate(request):
        if response.HasField("chunk"):
            chunks.append(response.chunk)
        elif response.HasField("complete"):
            complete_response = response.complete

    # No assertion on the number of chunks: there may be 0 chunks if
    # generation is very fast, and `len(chunks) >= 0` could never fail.

    # Should have a final complete response
    assert complete_response is not None
    assert complete_response.finish_reason in ["stop", "length"]
    assert complete_response.prompt_tokens > 0

    # Verify chunk structure
    for chunk in chunks:
        assert chunk.prompt_tokens > 0
        assert chunk.completion_tokens >= 0
@pytest.mark.asyncio
async def test_generate_with_different_sampling_params(grpc_client):
    """Generate completes under temperature/top_p and under top_k sampling."""
    # (request id, sampling parameters) pairs, sent one after another.
    scenarios = [
        (
            "test-sampling-temp",
            {"temperature": 0.8, "top_p": 0.95, "max_tokens": 5},
        ),
        (
            "test-sampling-topk",
            {"temperature": 1.0, "top_k": 50, "max_tokens": 5},
        ),
    ]

    for request_id, sampling_kwargs in scenarios:
        request = vllm_engine_pb2.GenerateRequest(
            request_id=request_id,
            tokenized=vllm_engine_pb2.TokenizedInput(
                original_text="Hello",
                input_ids=[15496],
            ),
            sampling_params=vllm_engine_pb2.SamplingParams(**sampling_kwargs),
            stream=False,
        )

        responses = [r async for r in grpc_client.Generate(request)]
        assert len(responses) == 1
        assert responses[0].HasField("complete")
@pytest.mark.asyncio
async def test_generate_with_stop_strings(grpc_client):
    """Generate finishes with "stop" or "length" when stop strings are set."""
    prompt = vllm_engine_pb2.TokenizedInput(
        original_text="Hello",
        input_ids=[15496],
    )
    sampling = vllm_engine_pb2.SamplingParams(
        temperature=0.0,
        max_tokens=20,
        stop=["\n", "END"],
    )
    request = vllm_engine_pb2.GenerateRequest(
        request_id="test-stop-strings",
        tokenized=prompt,
        sampling_params=sampling,
        stream=False,
    )

    responses = [r async for r in grpc_client.Generate(request)]
    assert len(responses) == 1

    only_message = responses[0]
    assert only_message.HasField("complete")
    assert only_message.complete.finish_reason in ["stop", "length"]
@pytest.mark.asyncio
async def test_generate_multiple_requests(grpc_client):
    """Test handling multiple concurrent Generate requests.

    Three non-streaming requests with distinct request ids are issued
    concurrently through ``asyncio.gather``; each must produce a response
    carrying the ``complete`` field.
    """

    async def make_request(request_id: str):
        """Run one non-streaming Generate call and return its first response."""
        request = vllm_engine_pb2.GenerateRequest(
            request_id=request_id,
            tokenized=vllm_engine_pb2.TokenizedInput(
                original_text="Hello",
                input_ids=[15496],
            ),
            sampling_params=vllm_engine_pb2.SamplingParams(
                temperature=0.0, max_tokens=5
            ),
            stream=False,
        )
        responses = [r async for r in grpc_client.Generate(request)]
        # Fail with a readable message instead of a bare IndexError when the
        # server returns an empty stream.
        assert responses, f"no response received for request {request_id}"
        return responses[0]

    # Send multiple requests concurrently
    request_ids = [f"test-concurrent-{i}" for i in range(3)]
    responses = await asyncio.gather(*(make_request(rid) for rid in request_ids))

    # Verify all requests completed successfully
    assert len(responses) == 3
    # gather() returns results in the order the awaitables were passed, so
    # each response can be paired with the id of the request that produced it.
    for request_id, response in zip(request_ids, responses):
        assert response.HasField("complete"), (
            f"request {request_id} did not return a complete response"
        )
@pytest.mark.asyncio
async def test_generate_with_seed(grpc_client):
    """Check that two Generate calls sharing a seed yield the same tokens.

    Two non-streaming requests with temperature 1.0, ten max tokens and
    seed 42 are sent one after the other; both must complete, and their
    ``output_ids`` must be equal.
    """
    shared_seed = 42

    def build_request(request_id: str, seed: int):
        prompt = vllm_engine_pb2.TokenizedInput(
            original_text="The future of AI is",
            input_ids=[464, 2003, 286, 9552, 318],
        )
        sampling = vllm_engine_pb2.SamplingParams(
            temperature=1.0,
            max_tokens=10,
            seed=seed,
        )
        return vllm_engine_pb2.GenerateRequest(
            request_id=request_id,
            tokenized=prompt,
            sampling_params=sampling,
            stream=False,
        )

    first_request = build_request("test-seed-1", shared_seed)
    second_request = build_request("test-seed-2", shared_seed)

    # The calls run sequentially, first request before second.
    first_responses = [r async for r in grpc_client.Generate(first_request)]
    second_responses = [r async for r in grpc_client.Generate(second_request)]

    # Each call must produce exactly one final response.
    for batch in (first_responses, second_responses):
        assert len(batch) == 1
    for batch in (first_responses, second_responses):
        assert batch[0].HasField("complete")

    # Identical seeds are expected to give identical token sequences.
    first_ids = list(first_responses[0].complete.output_ids)
    second_ids = list(second_responses[0].complete.output_ids)
    assert first_ids == second_ids
@pytest.mark.asyncio
async def test_generate_error_handling(grpc_client):
    """Check that an invalid ``top_p`` is rejected by the Generate RPC.

    The request sets ``top_p`` to -33. Consuming the response stream is
    expected to raise ``grpc.RpcError`` with status ``INVALID_ARGUMENT``
    and a details string that names the offending value.
    """
    bad_sampling = vllm_engine_pb2.SamplingParams(
        temperature=0.0,
        max_tokens=10,
        top_p=-33,
    )
    request = vllm_engine_pb2.GenerateRequest(
        request_id="test-error-invalid-topp",
        sampling_params=bad_sampling,
        stream=False,
    )

    # Iterating the stream is what surfaces the RPC error.
    with pytest.raises(grpc.RpcError) as exc_info:
        async for _ in grpc_client.Generate(request):
            pass

    # These checks sit outside the ``raises`` block so that they run after
    # the exception has been captured.
    rpc_error = exc_info.value
    assert rpc_error.code() == grpc.StatusCode.INVALID_ARGUMENT
    assert "top_p must be in (0, 1], got -33.0" in rpc_error.details()
@pytest.mark.asyncio
async def test_abort_request(grpc_client):
    """Check that the Abort RPC stops an in-flight streaming Generate call.

    A streaming request that asks for exactly 500 tokens (``min_tokens`` and
    ``max_tokens`` both 500) is started, and an Abort for the same request id
    is sent 0.1 seconds later. The test passes when the last response seen
    is a ``complete`` message with finish reason "abort" and fewer than 500
    chunks were received.
    """
    request_id = "test-abort-1"

    # min_tokens == max_tokens keeps the generation running long enough
    # for the abort to arrive while it is still in progress.
    sampling = vllm_engine_pb2.SamplingParams(
        temperature=0.0,
        min_tokens=500,
        max_tokens=500,
    )
    generate_request = vllm_engine_pb2.GenerateRequest(
        request_id=request_id,
        tokenized=vllm_engine_pb2.TokenizedInput(
            original_text="Hello",
            input_ids=[15496],
        ),
        sampling_params=sampling,
        stream=True,
    )

    # Shared between the two coroutines below.
    outcome = {"aborted": False, "chunks": 0}

    async def run_generate():
        async for response in grpc_client.Generate(generate_request):
            if response.HasField("chunk"):
                outcome["chunks"] += 1

            # Re-evaluated on every response, so only the last message
            # decides the final value: True only for a ``complete`` message
            # whose finish reason is "abort".
            is_final = response.HasField("complete")
            outcome["aborted"] = (
                is_final and response.complete.finish_reason == "abort"
            )

    async def abort_after_delay():
        # Give the generate call a moment to start before aborting it.
        await asyncio.sleep(0.1)
        await grpc_client.Abort(
            vllm_engine_pb2.AbortRequest(request_ids=[request_id])
        )

    # Drive the generation and the delayed abort side by side.
    await asyncio.gather(run_generate(), abort_after_delay())

    # An aborted request must end with the "abort" finish reason and must
    # stop before all 500 tokens have been streamed.
    assert outcome["aborted"] and outcome["chunks"] < 500, (
        "Request should have been aborted before generating all 500 tokens"
    )
tests/entrypoints/test_responses_utils.py
View file @
7e63ef82
...
...
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
openai.types.chat
import
ChatCompletionMessageParam
from
openai.types.responses.response_function_tool_call
import
ResponseFunctionToolCall
from
openai.types.responses.response_function_tool_call_output_item
import
(
ResponseFunctionToolCallOutputItem
,
...
...
@@ -14,8 +15,10 @@ from openai.types.responses.response_reasoning_item import (
Summary
,
)
from
vllm.entrypoints.constants
import
MCP_PREFIX
from
vllm.entrypoints.responses_utils
import
(
_construct_single_message_from_response_item
,
_maybe_combine_reasoning_and_tool_call
,
construct_chat_messages_with_tool_call
,
convert_tool_responses_to_completions_format
,
)
...
...
@@ -160,3 +163,118 @@ class TestResponsesUtils:
formatted_item
=
_construct_single_message_from_response_item
(
output_item
)
assert
formatted_item
[
"role"
]
==
"assistant"
assert
formatted_item
[
"content"
]
==
"dongyi"
class TestMaybeCombineReasoningAndToolCall:
    """Unit tests covering ``_maybe_combine_reasoning_and_tool_call``."""

    @staticmethod
    def _tool_call(item_id, arguments="{}"):
        """Build a function tool call that differs only in id and arguments."""
        return ResponseFunctionToolCall(
            type="function_call",
            id=item_id,
            call_id="call_123",
            name="test_function",
            arguments=arguments,
        )

    def test_returns_none_when_item_id_is_none(self):
        """
        Regression test for PR #31999: an item whose id is None must give
        None rather than raising TypeError from startswith().
        """
        # id=None is the value that used to trigger the TypeError.
        tool_call = self._tool_call(None)
        messages: list[ChatCompletionMessageParam] = []

        assert _maybe_combine_reasoning_and_tool_call(tool_call, messages) is None

    def test_returns_none_when_id_does_not_start_with_mcp_prefix(self):
        """A tool call whose id lacks the MCP prefix is not combined."""
        tool_call = self._tool_call("regular_id")
        history = [{"role": "assistant", "reasoning": "some reasoning"}]

        assert _maybe_combine_reasoning_and_tool_call(tool_call, history) is None

    def test_returns_none_when_last_message_is_not_assistant(self):
        """None is returned when the last message has a non-assistant role."""
        tool_call = self._tool_call(f"{MCP_PREFIX}tool_id")
        history = [{"role": "user", "content": "hello"}]

        assert _maybe_combine_reasoning_and_tool_call(tool_call, history) is None

    def test_returns_none_when_last_message_has_no_reasoning(self):
        """None is returned when the assistant message carries no reasoning."""
        tool_call = self._tool_call(f"{MCP_PREFIX}tool_id")
        history = [{"role": "assistant", "content": "some content"}]

        assert _maybe_combine_reasoning_and_tool_call(tool_call, history) is None

    def test_combines_reasoning_and_mcp_tool_call(self):
        """A reasoning message and an MCP tool call merge into one message."""
        tool_call = self._tool_call(
            f"{MCP_PREFIX}tool_id", arguments='{"arg": "value"}'
        )
        history = [{"role": "assistant", "reasoning": "I need to call this tool"}]

        combined = _maybe_combine_reasoning_and_tool_call(tool_call, history)

        assert combined is not None
        assert combined["role"] == "assistant"
        assert combined["reasoning"] == "I need to call this tool"
        assert "tool_calls" in combined

        calls = combined["tool_calls"]
        assert len(calls) == 1

        only_call = calls[0]
        assert only_call["id"] == "call_123"
        assert only_call["function"]["name"] == "test_function"
        assert only_call["function"]["arguments"] == '{"arg": "value"}'
        assert only_call["type"] == "function"

    def test_returns_none_for_non_function_tool_call_type(self):
        """An item that is not a ResponseFunctionToolCall yields None."""
        # A plain dict stands in for a non-tool-call item.
        not_a_tool_call = {"type": "message", "content": "hello"}
        history = [{"role": "assistant", "reasoning": "some reasoning"}]

        outcome = _maybe_combine_reasoning_and_tool_call(not_a_tool_call, history)
        assert outcome is None

    def test_returns_none_when_id_is_empty_string(self):
        """An empty-string id (falsy) yields None."""
        tool_call = self._tool_call("")
        history = [{"role": "assistant", "reasoning": "some reasoning"}]

        assert _maybe_combine_reasoning_and_tool_call(tool_call, history) is None
tests/entrypoints/test_utils.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.entrypoints.utils
import
sanitize_message
def test_sanitize_message():
    """Check ``sanitize_message`` on a repr string that embeds an address.

    For the sample input, the `` at 0x...`` part is expected to be absent
    from the result while the rest of the text is kept.
    """
    raw_message = "<_io.BytesIO object at 0x7a95e299e750>"
    expected = "<_io.BytesIO object>"

    assert sanitize_message(raw_message) == expected
tests/evals/gsm8k/README.md
View file @
7e63ef82
...
...
@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation,
### Run tests with pytest (like buildkite)
```
bash
pytest
-s
-v
tests/gsm8k/test_gsm8k_correctness.py
\
--config-list-file
=
configs/models-small.txt
\
--tp-size
=
1
pytest
-s
-v
tests/evals/gsm8k/test_gsm8k_correctness.py
\
--config-list-file
=
configs/models-small.txt
```
### Run standalone evaluation script
...
...
@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct"
accuracy_threshold
:
0.54
# Minimum expected accuracy
num_questions
:
1319
# Number of questions (default: full test set)
num_fewshot
:
5
# Few-shot examples from train set
max_model_len
:
4096
# Model context length
server_args
:
"
--max-model-len
4096
--tensor-parallel-size
2"
# Server arguments
env
:
# Environment variables (optional)
VLLM_USE_FLASHINFER_MOE_FP4
:
"
1"
```
The
`server_args`
field accepts any arguments that can be passed to
`vllm serve`
.
The
`env`
field accepts a dictionary of environment variables to set for the server process.
tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
deepseek-ai/DeepSeek-R1"
accuracy_threshold
:
0.95
num_questions
:
1319
num_fewshot
:
5
startup_max_wait_seconds
:
1200
server_args
:
>-
--enforce-eager
--max-model-len 4096
--data-parallel-size 8
--enable-expert-parallel
--speculative-config '{"method":"mtp","num_speculative_tokens":1}'
tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
deepseek-ai/DeepSeek-R1"
accuracy_threshold
:
0.95
num_questions
:
1319
num_fewshot
:
5
startup_max_wait_seconds
:
1200
server_args
:
>-
--enforce-eager
--max-model-len 4096
--tensor-parallel-size 8
--enable-expert-parallel
--speculative-config '{"method":"mtp","num_speculative_tokens":1}'
Prev
1
…
14
15
16
17
18
19
20
21
22
…
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment