Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a3f8d5dd
Commit
a3f8d5dd
authored
Dec 17, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori
parents
8d75f22e
f34eca5f
Changes
499
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2687 additions
and
105 deletions
+2687
-105
tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
+276
-0
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+18
-4
tests/entrypoints/openai/parser/test_harmony_utils.py
tests/entrypoints/openai/parser/test_harmony_utils.py
+666
-85
tests/entrypoints/openai/test_chat_error.py
tests/entrypoints/openai/test_chat_error.py
+227
-0
tests/entrypoints/openai/test_completion_error.py
tests/entrypoints/openai/test_completion_error.py
+216
-0
tests/entrypoints/openai/test_messages.py
tests/entrypoints/openai/test_messages.py
+6
-3
tests/entrypoints/openai/test_responses_error.py
tests/entrypoints/openai/test_responses_error.py
+89
-0
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+645
-1
tests/entrypoints/openai/test_serving_engine.py
tests/entrypoints/openai/test_serving_engine.py
+1
-1
tests/entrypoints/openai/test_serving_responses.py
tests/entrypoints/openai/test_serving_responses.py
+3
-3
tests/entrypoints/openai/test_sparse_tensor_validation.py
tests/entrypoints/openai/test_sparse_tensor_validation.py
+342
-0
tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
...ypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
+1
-1
tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
...ntrypoints/openai/tool_parsers/test_hermes_tool_parser.py
+1
-1
tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
...ints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
+1
-1
tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
...oints/openai/tool_parsers/test_llama3_json_tool_parser.py
+1
-1
tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
...s/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+1
-1
tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
...entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
+1
-1
tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
...rypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+1
-1
tests/entrypoints/openai/tool_parsers/utils.py
tests/entrypoints/openai/tool_parsers/utils.py
+1
-1
tests/entrypoints/openai/utils.py
tests/entrypoints/openai/utils.py
+190
-0
No files found.
tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Test that the interaction between EPLB and FusedMoE Layer is okay for DP w/ NVFP4
from
dataclasses
import
dataclass
import
pytest
import
torch
from
tests.kernels.moe.utils
import
make_test_quant_config
from
vllm.config
import
VllmConfig
,
set_current_vllm_config
from
vllm.distributed.eplb.rebalance_execute
import
rearrange_expert_weights_inplace
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
get_dp_group
,
)
from
vllm.forward_context
import
set_forward_context
from
vllm.model_executor.layers.fused_moe.layer
import
FusedMoE
from
vllm.model_executor.layers.quantization.modelopt
import
(
ModelOptNvFp4Config
,
ModelOptNvFp4FusedMoE
,
)
from
.eplb_utils
import
distributed_run
,
set_env_vars_and_device
@
dataclass
class
TestConfig
:
num_layers
:
int
num_experts
:
int
num_local_experts
:
int
num_topk
:
int
hidden_size
:
int
intermediate_size
:
int
num_tokens
:
int
def
make_fused_moe_layer
(
rank
:
int
,
layer_idx
:
int
,
test_config
:
TestConfig
,
)
->
FusedMoE
:
quant_config
=
None
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
quant_config
=
ModelOptNvFp4Config
(
is_checkpoint_nvfp4_serialized
=
True
,
kv_cache_quant_algo
=
None
,
exclude_modules
=
[],
)
fml
=
FusedMoE
(
num_experts
=
test_config
.
num_experts
,
top_k
=
test_config
.
num_topk
,
hidden_size
=
test_config
.
hidden_size
,
intermediate_size
=
test_config
.
intermediate_size
,
prefix
=
f
"dummy_layer_
{
layer_idx
}
"
,
activation
=
"silu"
,
is_act_and_mul
=
True
,
params_dtype
=
torch
.
bfloat16
,
quant_config
=
quant_config
,
)
nvfp4_fused_moe
=
ModelOptNvFp4FusedMoE
(
quant_config
,
fml
)
nvfp4_fused_moe
.
create_weights
(
fml
,
test_config
.
num_local_experts
,
test_config
.
hidden_size
,
test_config
.
intermediate_size
,
params_dtype
=
torch
.
uint8
,
global_num_experts
=
test_config
.
num_experts
,
)
fml
=
fml
.
to
(
device
)
w1_q
,
w2_q
,
quant_config
=
make_test_quant_config
(
test_config
.
num_local_experts
,
test_config
.
intermediate_size
,
test_config
.
hidden_size
,
in_dtype
=
torch
.
bfloat16
,
quant_dtype
=
"nvfp4"
,
block_shape
=
None
,
per_act_token_quant
=
False
,
)
fml
.
w13_weight
.
data
=
w1_q
fml
.
w2_weight
.
data
=
w2_q
fml
.
w2_input_scale
.
data
=
torch
.
randn_like
(
fml
.
w2_input_scale
.
data
)
/
5
fml
.
w13_input_scale
.
data
=
torch
.
randn_like
(
fml
.
w13_input_scale
.
data
)
/
5
fml
.
w2_weight_scale_2
.
data
=
torch
.
randn_like
(
fml
.
w2_weight_scale_2
.
data
)
/
5
fml
.
w13_weight_scale_2
.
data
=
torch
.
randn_like
(
fml
.
w13_weight_scale_2
.
data
)
/
5
fml
.
w2_weight_scale
.
data
=
(
torch
.
randn
(
fml
.
w2_weight_scale
.
data
.
shape
,
device
=
device
)
/
5
).
to
(
fml
.
w2_weight_scale
.
data
.
dtype
)
fml
.
w13_weight_scale
.
data
=
(
torch
.
randn
(
fml
.
w13_weight_scale
.
data
.
shape
,
device
=
device
)
/
5
).
to
(
fml
.
w13_weight_scale
.
data
.
dtype
)
nvfp4_fused_moe
.
process_weights_after_loading
(
fml
)
fml
.
maybe_init_modular_kernel
()
return
fml
def
_test_eplb_fml
(
env
,
world_size
:
int
,
test_config
:
TestConfig
):
set_env_vars_and_device
(
env
)
vllm_config
=
VllmConfig
()
vllm_config
.
parallel_config
.
data_parallel_size
=
world_size
vllm_config
.
parallel_config
.
enable_expert_parallel
=
True
with
set_current_vllm_config
(
vllm_config
):
ensure_model_parallel_initialized
(
tensor_model_parallel_size
=
1
,
pipeline_model_parallel_size
=
1
)
ep_group
=
get_dp_group
().
cpu_group
ep_rank
=
torch
.
distributed
.
get_rank
()
device
=
torch
.
device
(
f
"cuda:
{
ep_rank
}
"
)
fml_layers
=
[
make_fused_moe_layer
(
ep_rank
,
layer_idx
,
test_config
).
to
(
device
)
for
layer_idx
in
range
(
test_config
.
num_layers
)
]
rank_expert_weights
=
[
fml
.
get_expert_weights
()
for
fml
in
fml_layers
]
hidden_states
=
[]
router_logits
=
[]
for
layer_idx
in
range
(
test_config
.
num_layers
):
hidden_states
.
append
(
torch
.
randn
(
(
test_config
.
num_tokens
,
test_config
.
hidden_size
),
dtype
=
torch
.
bfloat16
,
device
=
device
,
)
)
router_logits
.
append
(
torch
.
randn
(
(
test_config
.
num_tokens
,
test_config
.
num_experts
),
dtype
=
torch
.
bfloat16
,
device
=
device
,
)
)
out_before_shuffle
=
[]
with
set_forward_context
(
{},
num_tokens
=
test_config
.
num_tokens
,
num_tokens_across_dp
=
torch
.
tensor
(
[
test_config
.
num_tokens
]
*
world_size
,
device
=
"cpu"
,
dtype
=
torch
.
int
),
vllm_config
=
vllm_config
,
):
for
lidx
,
fml
in
enumerate
(
fml_layers
):
out_before_shuffle
.
append
(
fml
(
hidden_states
[
lidx
].
clone
(),
router_logits
[
lidx
].
clone
())
)
indices
=
torch
.
zeros
(
test_config
.
num_layers
,
test_config
.
num_experts
,
dtype
=
torch
.
long
)
for
lidx
in
range
(
test_config
.
num_layers
):
indices
[
lidx
]
=
torch
.
Tensor
(
range
(
test_config
.
num_experts
))
shuffled_indices
=
torch
.
zeros_like
(
indices
)
for
lidx
in
range
(
test_config
.
num_layers
):
shuffled_indices
[
lidx
]
=
torch
.
randperm
(
test_config
.
num_experts
)
rearrange_expert_weights_inplace
(
indices
,
shuffled_indices
,
rank_expert_weights
,
ep_group
,
is_profile
=
False
,
)
num_global_experts
=
test_config
.
num_experts
logical_to_physical_map_list
=
[]
for
lidx
,
fml
in
enumerate
(
fml_layers
):
physical_to_logical_map
=
shuffled_indices
[
lidx
].
to
(
device
)
logical_to_physical_map
=
torch
.
empty
(
(
num_global_experts
,),
dtype
=
torch
.
int32
,
device
=
device
)
logical_to_physical_map
[
physical_to_logical_map
]
=
torch
.
arange
(
0
,
num_global_experts
,
dtype
=
torch
.
int32
,
device
=
device
)
logical_to_physical_map_list
.
append
(
logical_to_physical_map
.
reshape
(
num_global_experts
,
1
)
)
logical_to_physical_map
=
torch
.
stack
(
logical_to_physical_map_list
)
for
lidx
,
fml
in
enumerate
(
fml_layers
):
logical_replica_count
=
torch
.
ones
(
(
test_config
.
num_layers
,
num_global_experts
),
dtype
=
torch
.
int32
,
device
=
device
,
)
fml
.
enable_eplb
=
True
fml
.
set_eplb_state
(
lidx
,
torch
.
zeros
(
(
test_config
.
num_layers
,
num_global_experts
),
dtype
=
torch
.
int32
,
device
=
device
,
),
logical_to_physical_map
,
logical_replica_count
,
)
out_after_shuffle
=
[]
with
set_forward_context
(
{},
num_tokens
=
test_config
.
num_tokens
,
num_tokens_across_dp
=
torch
.
tensor
(
[
test_config
.
num_tokens
]
*
world_size
,
device
=
"cpu"
,
dtype
=
torch
.
int
),
vllm_config
=
vllm_config
,
):
for
lidx
,
fml
in
enumerate
(
fml_layers
):
out_after_shuffle
.
append
(
fml
(
hidden_states
[
lidx
].
clone
(),
router_logits
[
lidx
].
clone
())
)
for
lidx
in
range
(
test_config
.
num_layers
):
torch
.
testing
.
assert_close
(
out_before_shuffle
[
lidx
],
out_after_shuffle
[
lidx
],
atol
=
1e-1
,
rtol
=
1e-1
)
@
pytest
.
mark
.
parametrize
(
"world_size"
,
[
2
,
4
])
@
pytest
.
mark
.
parametrize
(
"num_layers"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"num_experts"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
256
])
@
pytest
.
mark
.
parametrize
(
"intermediate_size"
,
[
256
])
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
[
256
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"latency"
,
"throughput"
])
def
test_eplb_fml
(
world_size
:
int
,
num_layers
:
int
,
num_experts
:
int
,
hidden_size
:
int
,
intermediate_size
:
int
,
num_tokens
:
int
,
backend
:
str
,
monkeypatch
,
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
backend
)
if
torch
.
cuda
.
device_count
()
<
world_size
:
pytest
.
skip
(
f
"Need at least
{
world_size
}
GPUs to run the test"
)
num_local_experts
=
num_experts
//
world_size
num_topk
=
4
test_config
=
TestConfig
(
num_layers
=
num_layers
,
num_experts
=
num_experts
,
num_local_experts
=
num_local_experts
,
num_topk
=
num_topk
,
hidden_size
=
hidden_size
,
intermediate_size
=
intermediate_size
,
num_tokens
=
num_tokens
,
)
distributed_run
(
_test_eplb_fml
,
world_size
,
test_config
,
)
tests/engine/test_arg_utils.py
View file @
a3f8d5dd
...
...
@@ -350,21 +350,35 @@ def test_human_readable_model_len():
assert
args
.
max_model_len
==
1_000_000
args
=
parser
.
parse_args
([
"--max-model-len"
,
"10k"
])
assert
args
.
max_model_len
==
10_000
args
=
parser
.
parse_args
([
"--max-model-len"
,
"2g"
])
assert
args
.
max_model_len
==
2_000_000_000
args
=
parser
.
parse_args
([
"--max-model-len"
,
"2t"
])
assert
args
.
max_model_len
==
2_000_000_000_000
# Capital
args
=
parser
.
parse_args
([
"--max-model-len"
,
"3K"
])
assert
args
.
max_model_len
==
10
24
*
3
assert
args
.
max_model_len
==
2
**
10
*
3
args
=
parser
.
parse_args
([
"--max-model-len"
,
"10M"
])
assert
args
.
max_model_len
==
2
**
20
*
10
args
=
parser
.
parse_args
([
"--max-model-len"
,
"4G"
])
assert
args
.
max_model_len
==
2
**
30
*
4
args
=
parser
.
parse_args
([
"--max-model-len"
,
"4T"
])
assert
args
.
max_model_len
==
2
**
40
*
4
# Decimal values
args
=
parser
.
parse_args
([
"--max-model-len"
,
"10.2k"
])
assert
args
.
max_model_len
==
10200
# ..truncated to the nearest int
args
=
parser
.
parse_args
([
"--max-model-len"
,
"10.212345k"
])
args
=
parser
.
parse_args
([
"--max-model-len"
,
"10.212345
1234567
k"
])
assert
args
.
max_model_len
==
10212
args
=
parser
.
parse_args
([
"--max-model-len"
,
"10.2123451234567m"
])
assert
args
.
max_model_len
==
10212345
args
=
parser
.
parse_args
([
"--max-model-len"
,
"10.2123451234567g"
])
assert
args
.
max_model_len
==
10212345123
args
=
parser
.
parse_args
([
"--max-model-len"
,
"10.2123451234567t"
])
assert
args
.
max_model_len
==
10212345123456
# Invalid (do not allow decimals with binary multipliers)
for
invalid
in
[
"1a"
,
"pwd"
,
"10.24"
,
"1.23M"
]:
for
invalid
in
[
"1a"
,
"pwd"
,
"10.24"
,
"1.23M"
,
"1.22T"
]:
with
pytest
.
raises
(
ArgumentError
):
args
=
parser
.
parse_args
([
"--max-model-len"
,
invalid
])
parser
.
parse_args
([
"--max-model-len"
,
invalid
])
tests/entrypoints/openai/parser/test_harmony_utils.py
View file @
a3f8d5dd
This diff is collapsed.
Click to expand it.
tests/entrypoints/openai/test_chat_error.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
,
field
from
http
import
HTTPStatus
from
typing
import
Any
from
unittest.mock
import
AsyncMock
,
MagicMock
import
pytest
from
vllm.config.multimodal
import
MultiModalConfig
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
,
ErrorResponse
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.entrypoints.openai.serving_models
import
BaseModelPath
,
OpenAIServingModels
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.v1.engine.async_llm
import
AsyncLLM
MODEL_NAME
=
"openai-community/gpt2"
MODEL_NAME_SHORT
=
"gpt2"
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
),
BaseModelPath
(
name
=
MODEL_NAME_SHORT
,
model_path
=
MODEL_NAME_SHORT
),
]
@
dataclass
class
MockHFConfig
:
model_type
:
str
=
"any"
@
dataclass
class
MockModelConfig
:
task
=
"generate"
runner_type
=
"generate"
tokenizer
=
MODEL_NAME
trust_remote_code
=
False
tokenizer_mode
=
"auto"
max_model_len
=
100
tokenizer_revision
=
None
multimodal_config
=
MultiModalConfig
()
hf_config
=
MockHFConfig
()
logits_processor_pattern
=
None
logits_processors
:
list
[
str
]
|
None
=
None
diff_sampling_param
:
dict
|
None
=
None
allowed_local_media_path
:
str
=
""
allowed_media_domains
:
list
[
str
]
|
None
=
None
encoder_config
=
None
generation_config
:
str
=
"auto"
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
field
(
default_factory
=
dict
)
skip_tokenizer_init
=
False
def
get_diff_sampling_param
(
self
):
return
self
.
diff_sampling_param
or
{}
def
_build_serving_chat
(
engine
:
AsyncLLM
)
->
OpenAIServingChat
:
models
=
OpenAIServingModels
(
engine_client
=
engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
serving_chat
=
OpenAIServingChat
(
engine
,
models
,
response_role
=
"assistant"
,
request_logger
=
None
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
)
async
def
_fake_process_inputs
(
request_id
,
engine_prompt
,
sampling_params
,
*
,
lora_request
,
trace_headers
,
priority
,
):
return
dict
(
engine_prompt
),
{}
async
def
_fake_preprocess_chat
(
*
args
,
**
kwargs
):
# return conversation, engine_prompts
return
(
[{
"role"
:
"user"
,
"content"
:
"Test"
}],
[{
"prompt_token_ids"
:
[
1
,
2
,
3
]}],
)
serving_chat
.
_process_inputs
=
AsyncMock
(
side_effect
=
_fake_process_inputs
)
serving_chat
.
_preprocess_chat
=
AsyncMock
(
side_effect
=
_fake_preprocess_chat
)
return
serving_chat
@
pytest
.
mark
.
asyncio
async
def
test_chat_error_non_stream
():
"""test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
mock_engine
.
get_tokenizer
.
return_value
=
get_tokenizer
(
MODEL_NAME
)
mock_engine
.
errored
=
False
mock_engine
.
model_config
=
MockModelConfig
()
mock_engine
.
input_processor
=
MagicMock
()
mock_engine
.
io_processor
=
MagicMock
()
serving_chat
=
_build_serving_chat
(
mock_engine
)
completion_output
=
CompletionOutput
(
index
=
0
,
text
=
""
,
token_ids
=
[],
cumulative_logprob
=
None
,
logprobs
=
None
,
finish_reason
=
"error"
,
)
request_output
=
RequestOutput
(
request_id
=
"test-id"
,
prompt
=
"Test prompt"
,
prompt_token_ids
=
[
1
,
2
,
3
],
prompt_logprobs
=
None
,
outputs
=
[
completion_output
],
finished
=
True
,
metrics
=
None
,
lora_request
=
None
,
encoder_prompt
=
None
,
encoder_prompt_token_ids
=
None
,
)
async
def
mock_generate
(
*
args
,
**
kwargs
):
yield
request_output
mock_engine
.
generate
=
MagicMock
(
side_effect
=
mock_generate
)
request
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"user"
,
"content"
:
"Test prompt"
}],
max_tokens
=
10
,
stream
=
False
,
)
response
=
await
serving_chat
.
create_chat_completion
(
request
)
assert
isinstance
(
response
,
ErrorResponse
)
assert
response
.
error
.
type
==
"InternalServerError"
assert
response
.
error
.
message
==
"Internal server error"
assert
response
.
error
.
code
==
HTTPStatus
.
INTERNAL_SERVER_ERROR
@
pytest
.
mark
.
asyncio
async
def
test_chat_error_stream
():
"""test finish_reason='error' returns 500 InternalServerError (streaming)"""
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
mock_engine
.
get_tokenizer
.
return_value
=
get_tokenizer
(
MODEL_NAME
)
mock_engine
.
errored
=
False
mock_engine
.
model_config
=
MockModelConfig
()
mock_engine
.
input_processor
=
MagicMock
()
mock_engine
.
io_processor
=
MagicMock
()
serving_chat
=
_build_serving_chat
(
mock_engine
)
completion_output_1
=
CompletionOutput
(
index
=
0
,
text
=
"Hello"
,
token_ids
=
[
100
],
cumulative_logprob
=
None
,
logprobs
=
None
,
finish_reason
=
None
,
)
request_output_1
=
RequestOutput
(
request_id
=
"test-id"
,
prompt
=
"Test prompt"
,
prompt_token_ids
=
[
1
,
2
,
3
],
prompt_logprobs
=
None
,
outputs
=
[
completion_output_1
],
finished
=
False
,
metrics
=
None
,
lora_request
=
None
,
encoder_prompt
=
None
,
encoder_prompt_token_ids
=
None
,
)
completion_output_2
=
CompletionOutput
(
index
=
0
,
text
=
"Hello"
,
token_ids
=
[
100
],
cumulative_logprob
=
None
,
logprobs
=
None
,
finish_reason
=
"error"
,
)
request_output_2
=
RequestOutput
(
request_id
=
"test-id"
,
prompt
=
"Test prompt"
,
prompt_token_ids
=
[
1
,
2
,
3
],
prompt_logprobs
=
None
,
outputs
=
[
completion_output_2
],
finished
=
True
,
metrics
=
None
,
lora_request
=
None
,
encoder_prompt
=
None
,
encoder_prompt_token_ids
=
None
,
)
async
def
mock_generate
(
*
args
,
**
kwargs
):
yield
request_output_1
yield
request_output_2
mock_engine
.
generate
=
MagicMock
(
side_effect
=
mock_generate
)
request
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"user"
,
"content"
:
"Test prompt"
}],
max_tokens
=
10
,
stream
=
True
,
)
response
=
await
serving_chat
.
create_chat_completion
(
request
)
chunks
=
[]
async
for
chunk
in
response
:
chunks
.
append
(
chunk
)
assert
len
(
chunks
)
>=
2
assert
any
(
"Internal server error"
in
chunk
for
chunk
in
chunks
),
(
f
"Expected error message in chunks:
{
chunks
}
"
)
assert
chunks
[
-
1
]
==
"data: [DONE]
\n\n
"
tests/entrypoints/openai/test_completion_error.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
,
field
from
http
import
HTTPStatus
from
typing
import
Any
from
unittest.mock
import
AsyncMock
,
MagicMock
import
pytest
from
vllm.config.multimodal
import
MultiModalConfig
from
vllm.entrypoints.openai.protocol
import
CompletionRequest
,
ErrorResponse
from
vllm.entrypoints.openai.serving_completion
import
OpenAIServingCompletion
from
vllm.entrypoints.openai.serving_models
import
BaseModelPath
,
OpenAIServingModels
from
vllm.outputs
import
CompletionOutput
,
RequestOutput
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.v1.engine.async_llm
import
AsyncLLM
MODEL_NAME
=
"openai-community/gpt2"
MODEL_NAME_SHORT
=
"gpt2"
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
),
BaseModelPath
(
name
=
MODEL_NAME_SHORT
,
model_path
=
MODEL_NAME_SHORT
),
]
@
dataclass
class
MockHFConfig
:
model_type
:
str
=
"any"
@
dataclass
class
MockModelConfig
:
task
=
"generate"
runner_type
=
"generate"
tokenizer
=
MODEL_NAME
trust_remote_code
=
False
tokenizer_mode
=
"auto"
max_model_len
=
100
tokenizer_revision
=
None
multimodal_config
=
MultiModalConfig
()
hf_config
=
MockHFConfig
()
logits_processor_pattern
=
None
logits_processors
:
list
[
str
]
|
None
=
None
diff_sampling_param
:
dict
|
None
=
None
allowed_local_media_path
:
str
=
""
allowed_media_domains
:
list
[
str
]
|
None
=
None
encoder_config
=
None
generation_config
:
str
=
"auto"
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
field
(
default_factory
=
dict
)
skip_tokenizer_init
=
False
def
get_diff_sampling_param
(
self
):
return
self
.
diff_sampling_param
or
{}
def
_build_serving_completion
(
engine
:
AsyncLLM
)
->
OpenAIServingCompletion
:
models
=
OpenAIServingModels
(
engine_client
=
engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
serving_completion
=
OpenAIServingCompletion
(
engine
,
models
,
request_logger
=
None
,
)
async
def
_fake_process_inputs
(
request_id
,
engine_prompt
,
sampling_params
,
*
,
lora_request
,
trace_headers
,
priority
,
):
return
dict
(
engine_prompt
),
{}
serving_completion
.
_process_inputs
=
AsyncMock
(
side_effect
=
_fake_process_inputs
)
return
serving_completion
@
pytest
.
mark
.
asyncio
async
def
test_completion_error_non_stream
():
"""test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
mock_engine
.
get_tokenizer
.
return_value
=
get_tokenizer
(
MODEL_NAME
)
mock_engine
.
errored
=
False
mock_engine
.
model_config
=
MockModelConfig
()
mock_engine
.
input_processor
=
MagicMock
()
mock_engine
.
io_processor
=
MagicMock
()
serving_completion
=
_build_serving_completion
(
mock_engine
)
completion_output
=
CompletionOutput
(
index
=
0
,
text
=
""
,
token_ids
=
[],
cumulative_logprob
=
None
,
logprobs
=
None
,
finish_reason
=
"error"
,
)
request_output
=
RequestOutput
(
request_id
=
"test-id"
,
prompt
=
"Test prompt"
,
prompt_token_ids
=
[
1
,
2
,
3
],
prompt_logprobs
=
None
,
outputs
=
[
completion_output
],
finished
=
True
,
metrics
=
None
,
lora_request
=
None
,
encoder_prompt
=
None
,
encoder_prompt_token_ids
=
None
,
)
async
def
mock_generate
(
*
args
,
**
kwargs
):
yield
request_output
mock_engine
.
generate
=
MagicMock
(
side_effect
=
mock_generate
)
request
=
CompletionRequest
(
model
=
MODEL_NAME
,
prompt
=
"Test prompt"
,
max_tokens
=
10
,
stream
=
False
,
)
response
=
await
serving_completion
.
create_completion
(
request
)
assert
isinstance
(
response
,
ErrorResponse
)
assert
response
.
error
.
type
==
"InternalServerError"
assert
response
.
error
.
message
==
"Internal server error"
assert
response
.
error
.
code
==
HTTPStatus
.
INTERNAL_SERVER_ERROR
@
pytest
.
mark
.
asyncio
async
def
test_completion_error_stream
():
"""test finish_reason='error' returns 500 InternalServerError (streaming)"""
mock_engine
=
MagicMock
(
spec
=
AsyncLLM
)
mock_engine
.
get_tokenizer
.
return_value
=
get_tokenizer
(
MODEL_NAME
)
mock_engine
.
errored
=
False
mock_engine
.
model_config
=
MockModelConfig
()
mock_engine
.
input_processor
=
MagicMock
()
mock_engine
.
io_processor
=
MagicMock
()
serving_completion
=
_build_serving_completion
(
mock_engine
)
completion_output_1
=
CompletionOutput
(
index
=
0
,
text
=
"Hello"
,
token_ids
=
[
100
],
cumulative_logprob
=
None
,
logprobs
=
None
,
finish_reason
=
None
,
)
request_output_1
=
RequestOutput
(
request_id
=
"test-id"
,
prompt
=
"Test prompt"
,
prompt_token_ids
=
[
1
,
2
,
3
],
prompt_logprobs
=
None
,
outputs
=
[
completion_output_1
],
finished
=
False
,
metrics
=
None
,
lora_request
=
None
,
encoder_prompt
=
None
,
encoder_prompt_token_ids
=
None
,
)
completion_output_2
=
CompletionOutput
(
index
=
0
,
text
=
"Hello"
,
token_ids
=
[
100
],
cumulative_logprob
=
None
,
logprobs
=
None
,
finish_reason
=
"error"
,
)
request_output_2
=
RequestOutput
(
request_id
=
"test-id"
,
prompt
=
"Test prompt"
,
prompt_token_ids
=
[
1
,
2
,
3
],
prompt_logprobs
=
None
,
outputs
=
[
completion_output_2
],
finished
=
True
,
metrics
=
None
,
lora_request
=
None
,
encoder_prompt
=
None
,
encoder_prompt_token_ids
=
None
,
)
async
def
mock_generate
(
*
args
,
**
kwargs
):
yield
request_output_1
yield
request_output_2
mock_engine
.
generate
=
MagicMock
(
side_effect
=
mock_generate
)
request
=
CompletionRequest
(
model
=
MODEL_NAME
,
prompt
=
"Test prompt"
,
max_tokens
=
10
,
stream
=
True
,
)
response
=
await
serving_completion
.
create_completion
(
request
)
chunks
=
[]
async
for
chunk
in
response
:
chunks
.
append
(
chunk
)
assert
len
(
chunks
)
>=
2
assert
any
(
"Internal server error"
in
chunk
for
chunk
in
chunks
),
(
f
"Expected error message in chunks:
{
chunks
}
"
)
assert
chunks
[
-
1
]
==
"data: [DONE]
\n\n
"
tests/entrypoints/openai/test_messages.py
View file @
a3f8d5dd
...
...
@@ -79,9 +79,12 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
assert
chunk_count
>
0
assert
first_chunk
is
not
None
,
"message_start chunk was never observed"
assert
first_chunk
.
usage
is
not
None
,
"first chunk should include usage stats"
assert
first_chunk
.
usage
[
"output_tokens"
]
==
0
assert
first_chunk
.
usage
[
"input_tokens"
]
>
5
assert
first_chunk
.
message
is
not
None
,
"first chunk should include message"
assert
first_chunk
.
message
.
usage
is
not
None
,
(
"first chunk should include usage stats"
)
assert
first_chunk
.
message
.
usage
.
output_tokens
==
0
assert
first_chunk
.
message
.
usage
.
input_tokens
>
5
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_responses_error.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
http
import
HTTPStatus
from
unittest.mock
import
MagicMock
import
pytest
from
vllm.entrypoints.openai.protocol
import
ErrorResponse
from
vllm.entrypoints.openai.serving_engine
import
GenerationError
,
OpenAIServing
@
pytest
.
mark
.
asyncio
async
def
test_raise_if_error_raises_generation_error
():
"""test _raise_if_error raises GenerationError"""
# create a minimal OpenAIServing instance
mock_engine
=
MagicMock
()
mock_engine
.
model_config
=
MagicMock
()
mock_engine
.
model_config
.
max_model_len
=
100
mock_models
=
MagicMock
()
serving
=
OpenAIServing
(
engine_client
=
mock_engine
,
models
=
mock_models
,
request_logger
=
None
,
)
# test that error finish_reason raises GenerationError
with
pytest
.
raises
(
GenerationError
)
as
exc_info
:
serving
.
_raise_if_error
(
"error"
,
"test-request-id"
)
assert
str
(
exc_info
.
value
)
==
"Internal server error"
assert
exc_info
.
value
.
status_code
==
HTTPStatus
.
INTERNAL_SERVER_ERROR
# test that other finish_reasons don't raise
serving
.
_raise_if_error
(
"stop"
,
"test-request-id"
)
# should not raise
serving
.
_raise_if_error
(
"length"
,
"test-request-id"
)
# should not raise
serving
.
_raise_if_error
(
None
,
"test-request-id"
)
# should not raise
@
pytest
.
mark
.
asyncio
async
def
test_convert_generation_error_to_response
():
"""test _convert_generation_error_to_response creates proper ErrorResponse"""
mock_engine
=
MagicMock
()
mock_engine
.
model_config
=
MagicMock
()
mock_engine
.
model_config
.
max_model_len
=
100
mock_models
=
MagicMock
()
serving
=
OpenAIServing
(
engine_client
=
mock_engine
,
models
=
mock_models
,
request_logger
=
None
,
)
# create a GenerationError
gen_error
=
GenerationError
(
"Internal server error"
)
# convert to ErrorResponse
error_response
=
serving
.
_convert_generation_error_to_response
(
gen_error
)
assert
isinstance
(
error_response
,
ErrorResponse
)
assert
error_response
.
error
.
type
==
"InternalServerError"
assert
error_response
.
error
.
message
==
"Internal server error"
assert
error_response
.
error
.
code
==
HTTPStatus
.
INTERNAL_SERVER_ERROR
@
pytest
.
mark
.
asyncio
async
def
test_convert_generation_error_to_streaming_response
():
"""test _convert_generation_error_to_streaming_response output"""
mock_engine
=
MagicMock
()
mock_engine
.
model_config
=
MagicMock
()
mock_engine
.
model_config
.
max_model_len
=
100
mock_models
=
MagicMock
()
serving
=
OpenAIServing
(
engine_client
=
mock_engine
,
models
=
mock_models
,
request_logger
=
None
,
)
# create a GenerationError
gen_error
=
GenerationError
(
"Internal server error"
)
# convert to streaming error response
error_json
=
serving
.
_convert_generation_error_to_streaming_response
(
gen_error
)
assert
isinstance
(
error_json
,
str
)
assert
"Internal server error"
in
error_json
assert
"InternalServerError"
in
error_json
tests/entrypoints/openai/test_serving_chat.py
View file @
a3f8d5dd
This diff is collapsed.
Click to expand it.
tests/entrypoints/openai/test_serving_engine.py
View file @
a3f8d5dd
...
...
@@ -10,7 +10,7 @@ import pytest
from
vllm.config
import
ModelConfig
from
vllm.entrypoints.openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.openai.serving_models
import
OpenAIServingModels
from
vllm.tokenizers
import
MistralTokenizer
from
vllm.tokenizers
.mistral
import
MistralTokenizer
@
pytest
.
fixture
()
...
...
tests/entrypoints/openai/test_serving_responses.py
View file @
a3f8d5dd
...
...
@@ -21,7 +21,7 @@ from vllm.entrypoints.openai.serving_responses import (
extract_tool_types
,
)
from
vllm.entrypoints.tool_server
import
ToolServer
from
vllm.inputs.data
import
TokensPrompt
as
EngineTokensPrompt
from
vllm.inputs.data
import
TokensPrompt
class
MockConversationContext
(
ConversationContext
):
...
...
@@ -237,7 +237,7 @@ class TestValidateGeneratorInput:
"""Test _validate_generator_input with valid prompt length"""
# Create an engine prompt with valid length (less than max_model_len)
valid_prompt_token_ids
=
list
(
range
(
5
))
# 5 tokens < 100 max_model_len
engine_prompt
=
Engine
TokensPrompt
(
prompt_token_ids
=
valid_prompt_token_ids
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
valid_prompt_token_ids
)
# Call the method
result
=
serving_responses_instance
.
_validate_generator_input
(
engine_prompt
)
...
...
@@ -247,7 +247,7 @@ class TestValidateGeneratorInput:
# create an invalid engine prompt
invalid_prompt_token_ids
=
list
(
range
(
200
))
# 100 tokens >= 100 max_model_len
engine_prompt
=
Engine
TokensPrompt
(
prompt_token_ids
=
invalid_prompt_token_ids
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
invalid_prompt_token_ids
)
# Call the method
result
=
serving_responses_instance
.
_validate_generator_input
(
engine_prompt
)
...
...
tests/entrypoints/openai/test_sparse_tensor_validation.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Sparse tensor validation in embedding APIs.
Tests verify that malicious sparse tensors are rejected before they can trigger
out-of-bounds memory writes during to_dense() operations.
"""
import
base64
import
io
import
pytest
import
torch
from
vllm.entrypoints.renderer
import
CompletionRenderer
from
vllm.multimodal.audio
import
AudioEmbeddingMediaIO
from
vllm.multimodal.image
import
ImageEmbeddingMediaIO
def
_encode_tensor
(
tensor
:
torch
.
Tensor
)
->
bytes
:
"""Helper to encode a tensor as base64 bytes."""
buffer
=
io
.
BytesIO
()
torch
.
save
(
tensor
,
buffer
)
buffer
.
seek
(
0
)
return
base64
.
b64encode
(
buffer
.
read
())
def
_create_malicious_sparse_tensor
()
->
torch
.
Tensor
:
"""
Create a malicious sparse COO tensor with out-of-bounds indices.
This tensor has indices that point beyond the declared shape, which would
cause an out-of-bounds write when converted to dense format without
validation.
"""
# Create a 3x3 sparse tensor but with indices pointing to (10, 10)
indices
=
torch
.
tensor
([[
10
],
[
10
]])
# Out of bounds for 3x3 shape
values
=
torch
.
tensor
([
1.0
])
shape
=
(
3
,
3
)
# Create sparse tensor (this will be invalid)
sparse_tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
shape
,
dtype
=
torch
.
float32
)
return
sparse_tensor
def
_create_valid_sparse_tensor
()
->
torch
.
Tensor
:
"""Create a valid sparse COO tensor for baseline testing."""
indices
=
torch
.
tensor
([[
0
,
1
,
2
],
[
0
,
1
,
2
]])
values
=
torch
.
tensor
([
1.0
,
2.0
,
3.0
])
shape
=
(
3
,
3
)
sparse_tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
shape
,
dtype
=
torch
.
float32
)
return
sparse_tensor
def
_create_valid_dense_tensor
()
->
torch
.
Tensor
:
"""Create a valid dense tensor for baseline testing."""
return
torch
.
randn
(
10
,
768
,
dtype
=
torch
.
float32
)
# (seq_len, hidden_size)
class
TestPromptEmbedsValidation
:
"""Test sparse tensor validation in prompt embeddings (Completions API)."""
def
test_valid_dense_tensor_accepted
(
self
,
model_config
):
"""Baseline: Valid dense tensors should work normally."""
renderer
=
CompletionRenderer
(
model_config
)
valid_tensor
=
_create_valid_dense_tensor
()
encoded
=
_encode_tensor
(
valid_tensor
)
# Should not raise any exception
result
=
renderer
.
load_prompt_embeds
(
encoded
)
assert
len
(
result
)
==
1
assert
result
[
0
][
"prompt_embeds"
].
shape
==
valid_tensor
.
shape
def
test_valid_sparse_tensor_accepted
(
self
):
"""Baseline: Valid sparse tensors should load successfully."""
io_handler
=
ImageEmbeddingMediaIO
()
valid_sparse
=
_create_valid_sparse_tensor
()
encoded
=
_encode_tensor
(
valid_sparse
)
# Should not raise any exception (sparse tensors remain sparse)
result
=
io_handler
.
load_base64
(
""
,
encoded
.
decode
(
"utf-8"
))
assert
result
.
shape
==
valid_sparse
.
shape
def
test_malicious_sparse_tensor_rejected
(
self
,
model_config
):
"""Security: Malicious sparse tensors should be rejected."""
renderer
=
CompletionRenderer
(
model_config
)
malicious_tensor
=
_create_malicious_sparse_tensor
()
encoded
=
_encode_tensor
(
malicious_tensor
)
# Should raise RuntimeError due to invalid sparse tensor
with
pytest
.
raises
((
RuntimeError
,
ValueError
))
as
exc_info
:
renderer
.
load_prompt_embeds
(
encoded
)
# Error should indicate sparse tensor validation failure
error_msg
=
str
(
exc_info
.
value
).
lower
()
assert
"sparse"
in
error_msg
or
"index"
in
error_msg
or
"bounds"
in
error_msg
def
test_extremely_large_indices_rejected
(
self
,
model_config
):
"""Security: Sparse tensors with extremely large indices should be rejected."""
renderer
=
CompletionRenderer
(
model_config
)
# Create tensor with indices far beyond reasonable bounds
indices
=
torch
.
tensor
([[
999999
],
[
999999
]])
values
=
torch
.
tensor
([
1.0
])
shape
=
(
10
,
10
)
malicious_tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
shape
,
dtype
=
torch
.
float32
)
encoded
=
_encode_tensor
(
malicious_tensor
)
with
pytest
.
raises
((
RuntimeError
,
ValueError
)):
renderer
.
load_prompt_embeds
(
encoded
)
def
test_negative_indices_rejected
(
self
,
model_config
):
"""Security: Sparse tensors with negative indices should be rejected."""
renderer
=
CompletionRenderer
(
model_config
)
# Create tensor with negative indices
indices
=
torch
.
tensor
([[
-
1
],
[
-
1
]])
values
=
torch
.
tensor
([
1.0
])
shape
=
(
10
,
10
)
malicious_tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
shape
,
dtype
=
torch
.
float32
)
encoded
=
_encode_tensor
(
malicious_tensor
)
with
pytest
.
raises
((
RuntimeError
,
ValueError
)):
renderer
.
load_prompt_embeds
(
encoded
)
class
TestImageEmbedsValidation
:
"""Test sparse tensor validation in image embeddings (Chat API)."""
def
test_valid_dense_tensor_accepted
(
self
):
"""Baseline: Valid dense tensors should work normally."""
io_handler
=
ImageEmbeddingMediaIO
()
valid_tensor
=
_create_valid_dense_tensor
()
encoded
=
_encode_tensor
(
valid_tensor
)
# Should not raise any exception
result
=
io_handler
.
load_base64
(
""
,
encoded
.
decode
(
"utf-8"
))
assert
result
.
shape
==
valid_tensor
.
shape
def
test_valid_sparse_tensor_accepted
(
self
):
"""Baseline: Valid sparse tensors should load successfully."""
io_handler
=
AudioEmbeddingMediaIO
()
valid_sparse
=
_create_valid_sparse_tensor
()
encoded
=
_encode_tensor
(
valid_sparse
)
# Should not raise any exception (sparse tensors remain sparse)
result
=
io_handler
.
load_base64
(
""
,
encoded
.
decode
(
"utf-8"
))
assert
result
.
shape
==
valid_sparse
.
shape
def
test_malicious_sparse_tensor_rejected
(
self
):
"""Security: Malicious sparse tensors should be rejected."""
io_handler
=
ImageEmbeddingMediaIO
()
malicious_tensor
=
_create_malicious_sparse_tensor
()
encoded
=
_encode_tensor
(
malicious_tensor
)
# Should raise RuntimeError due to invalid sparse tensor
with
pytest
.
raises
((
RuntimeError
,
ValueError
))
as
exc_info
:
io_handler
.
load_base64
(
""
,
encoded
.
decode
(
"utf-8"
))
error_msg
=
str
(
exc_info
.
value
).
lower
()
assert
"sparse"
in
error_msg
or
"index"
in
error_msg
or
"bounds"
in
error_msg
def
test_load_bytes_validates
(
self
):
"""Security: Validation should also work for load_bytes method."""
io_handler
=
ImageEmbeddingMediaIO
()
malicious_tensor
=
_create_malicious_sparse_tensor
()
buffer
=
io
.
BytesIO
()
torch
.
save
(
malicious_tensor
,
buffer
)
buffer
.
seek
(
0
)
with
pytest
.
raises
((
RuntimeError
,
ValueError
)):
io_handler
.
load_bytes
(
buffer
.
read
())
class
TestAudioEmbedsValidation
:
"""Test sparse tensor validation in audio embeddings (Chat API)."""
def
test_valid_dense_tensor_accepted
(
self
):
"""Baseline: Valid dense tensors should work normally."""
io_handler
=
AudioEmbeddingMediaIO
()
valid_tensor
=
_create_valid_dense_tensor
()
encoded
=
_encode_tensor
(
valid_tensor
)
# Should not raise any exception
result
=
io_handler
.
load_base64
(
""
,
encoded
.
decode
(
"utf-8"
))
assert
result
.
shape
==
valid_tensor
.
shape
def
test_valid_sparse_tensor_accepted
(
self
):
"""Baseline: Valid sparse tensors should be converted successfully."""
io_handler
=
AudioEmbeddingMediaIO
()
valid_sparse
=
_create_valid_sparse_tensor
()
encoded
=
_encode_tensor
(
valid_sparse
)
# Should not raise any exception
result
=
io_handler
.
load_base64
(
""
,
encoded
.
decode
(
"utf-8"
))
assert
result
.
is_sparse
is
False
def
test_malicious_sparse_tensor_rejected
(
self
):
"""Security: Malicious sparse tensors should be rejected."""
io_handler
=
AudioEmbeddingMediaIO
()
malicious_tensor
=
_create_malicious_sparse_tensor
()
encoded
=
_encode_tensor
(
malicious_tensor
)
# Should raise RuntimeError due to invalid sparse tensor
with
pytest
.
raises
((
RuntimeError
,
ValueError
))
as
exc_info
:
io_handler
.
load_base64
(
""
,
encoded
.
decode
(
"utf-8"
))
error_msg
=
str
(
exc_info
.
value
).
lower
()
assert
"sparse"
in
error_msg
or
"index"
in
error_msg
or
"bounds"
in
error_msg
def
test_load_bytes_validates
(
self
):
"""Security: Validation should also work for load_bytes method."""
io_handler
=
AudioEmbeddingMediaIO
()
malicious_tensor
=
_create_malicious_sparse_tensor
()
buffer
=
io
.
BytesIO
()
torch
.
save
(
malicious_tensor
,
buffer
)
buffer
.
seek
(
0
)
with
pytest
.
raises
((
RuntimeError
,
ValueError
)):
io_handler
.
load_bytes
(
buffer
.
read
())
class
TestSparseTensorValidationIntegration
:
"""
These tests verify the complete attack chain is blocked at all entry points.
"""
def
test_attack_scenario_completions_api
(
self
,
model_config
):
"""
Simulate a complete attack through the Completions API.
Attack scenario:
1. Attacker crafts malicious sparse tensor
2. Encodes it as base64
3. Sends to /v1/completions with prompt_embeds parameter
4. Server should reject before memory corruption occurs
"""
renderer
=
CompletionRenderer
(
model_config
)
# Step 1-2: Attacker creates malicious payload
attack_payload
=
_encode_tensor
(
_create_malicious_sparse_tensor
())
# Step 3-4: Server processes and should reject
with
pytest
.
raises
((
RuntimeError
,
ValueError
)):
renderer
.
load_prompt_embeds
(
attack_payload
)
def
test_attack_scenario_chat_api_image
(
self
):
"""
Simulate attack through Chat API with image_embeds.
Verifies the image embeddings path is protected.
"""
io_handler
=
ImageEmbeddingMediaIO
()
attack_payload
=
_encode_tensor
(
_create_malicious_sparse_tensor
())
with
pytest
.
raises
((
RuntimeError
,
ValueError
)):
io_handler
.
load_base64
(
""
,
attack_payload
.
decode
(
"utf-8"
))
def
test_attack_scenario_chat_api_audio
(
self
):
"""
Simulate attack through Chat API with audio_embeds.
Verifies the audio embeddings path is protected.
"""
io_handler
=
AudioEmbeddingMediaIO
()
attack_payload
=
_encode_tensor
(
_create_malicious_sparse_tensor
())
with
pytest
.
raises
((
RuntimeError
,
ValueError
)):
io_handler
.
load_base64
(
""
,
attack_payload
.
decode
(
"utf-8"
))
def
test_multiple_valid_embeddings_in_batch
(
self
,
model_config
):
"""
Regression test: Multiple valid embeddings should still work.
Ensures the fix doesn't break legitimate batch processing.
"""
renderer
=
CompletionRenderer
(
model_config
)
valid_tensors
=
[
_encode_tensor
(
_create_valid_dense_tensor
()),
_encode_tensor
(
_create_valid_dense_tensor
()),
_encode_tensor
(
_create_valid_dense_tensor
()),
]
# Should process all without error
result
=
renderer
.
load_prompt_embeds
(
valid_tensors
)
assert
len
(
result
)
==
3
def
test_mixed_valid_and_malicious_rejected
(
self
,
model_config
):
"""
Security: Batch with one malicious tensor should be rejected.
Even if most tensors are valid, a single malicious one should
cause rejection of the entire batch.
"""
renderer
=
CompletionRenderer
(
model_config
)
mixed_batch
=
[
_encode_tensor
(
_create_valid_dense_tensor
()),
_encode_tensor
(
_create_malicious_sparse_tensor
()),
# Malicious
_encode_tensor
(
_create_valid_dense_tensor
()),
]
# Should fail on the malicious tensor
with
pytest
.
raises
((
RuntimeError
,
ValueError
)):
renderer
.
load_prompt_embeds
(
mixed_batch
)
# Pytest fixtures
@
pytest
.
fixture
def
model_config
():
"""Mock ModelConfig for testing."""
from
vllm.config
import
ModelConfig
return
ModelConfig
(
model
=
"facebook/opt-125m"
,
tokenizer
=
"facebook/opt-125m"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
dtype
=
"float32"
,
seed
=
0
,
enable_prompt_embeds
=
True
,
# Required for prompt embeds tests
)
tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
View file @
a3f8d5dd
...
...
@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming
,
)
from
vllm.entrypoints.openai.protocol
import
FunctionCall
from
vllm.entrypoints.openai.tool_parsers
import
ToolParser
,
ToolParserManager
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tool_parsers
import
ToolParser
,
ToolParserManager
SIMPLE_ARGS_DICT
=
{
"action"
:
"create"
,
...
...
tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
View file @
a3f8d5dd
...
...
@@ -6,8 +6,8 @@ import json
import
pytest
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.tool_parsers.hermes_tool_parser
import
Hermes2ProToolParser
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tool_parsers.hermes_tool_parser
import
Hermes2ProToolParser
from
....utils
import
RemoteOpenAIServer
...
...
tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
View file @
a3f8d5dd
...
...
@@ -12,7 +12,7 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming
,
)
from
vllm.entrypoints.openai.protocol
import
FunctionCall
,
ToolCall
from
vllm.
entrypoints.openai.
tool_parsers
import
ToolParser
,
ToolParserManager
from
vllm.tool_parsers
import
ToolParser
,
ToolParserManager
def
make_tool_call
(
name
,
arguments
):
...
...
tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
View file @
a3f8d5dd
...
...
@@ -6,8 +6,8 @@ from unittest.mock import MagicMock, patch
import
pytest
from
vllm.entrypoints.openai.protocol
import
ExtractedToolCallInformation
from
vllm.entrypoints.openai.tool_parsers.llama_tool_parser
import
Llama3JsonToolParser
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tool_parsers.llama_tool_parser
import
Llama3JsonToolParser
@
pytest
.
fixture
...
...
tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
View file @
a3f8d5dd
...
...
@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming
,
)
from
vllm.entrypoints.openai.protocol
import
FunctionCall
from
vllm.entrypoints.openai.tool_parsers
import
ToolParser
,
ToolParserManager
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tool_parsers
import
ToolParser
,
ToolParserManager
# Test cases similar to pythonic parser but with Llama4 specific format
SIMPLE_FUNCTION_OUTPUT
=
"[get_weather(city='LA', metric='C')]"
...
...
tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
View file @
a3f8d5dd
...
...
@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming
,
)
from
vllm.entrypoints.openai.protocol
import
FunctionCall
from
vllm.entrypoints.openai.tool_parsers
import
ToolParser
,
ToolParserManager
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tool_parsers
import
ToolParser
,
ToolParserManager
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
SIMPLE_FUNCTION_OUTPUT
=
"get_weather(city='San Francisco', metric='celsius')"
...
...
tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
View file @
a3f8d5dd
...
...
@@ -10,8 +10,8 @@ from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction_streaming
,
)
from
vllm.entrypoints.openai.protocol
import
FunctionCall
from
vllm.entrypoints.openai.tool_parsers
import
ToolParser
,
ToolParserManager
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tool_parsers
import
ToolParser
,
ToolParserManager
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
SIMPLE_FUNCTION_OUTPUT
=
"get_weather(city='San Francisco', metric='celsius')"
...
...
tests/entrypoints/openai/tool_parsers/utils.py
View file @
a3f8d5dd
...
...
@@ -10,8 +10,8 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall
,
ToolCall
,
)
from
vllm.entrypoints.openai.tool_parsers
import
ToolParser
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tool_parsers
import
ToolParser
class
StreamingToolReconstructor
:
...
...
tests/entrypoints/openai/utils.py
0 → 100644
View file @
a3f8d5dd
This diff is collapsed.
Click to expand it.
Prev
1
2
3
4
5
6
7
8
9
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment