Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a3f8d5dd
Commit
a3f8d5dd
authored
Dec 17, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori
parents
8d75f22e
f34eca5f
Changes
499
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1083 additions
and
175 deletions
+1083
-175
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+2
-5
tests/models/multimodal/processing/test_gemma3.py
tests/models/multimodal/processing/test_gemma3.py
+42
-0
tests/models/multimodal/processing/test_mllama4.py
tests/models/multimodal/processing/test_mllama4.py
+2
-2
tests/models/multimodal/processing/test_qwen2_vl.py
tests/models/multimodal/processing/test_qwen2_vl.py
+35
-0
tests/models/multimodal/processing/test_tensor_schema.py
tests/models/multimodal/processing/test_tensor_schema.py
+8
-0
tests/models/registry.py
tests/models/registry.py
+14
-11
tests/multimodal/test_sparse_tensor_validation_unit.py
tests/multimodal/test_sparse_tensor_validation_unit.py
+134
-0
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+100
-2
tests/multimodal/test_video.py
tests/multimodal/test_video.py
+123
-1
tests/quantization/test_blackwell_moe.py
tests/quantization/test_blackwell_moe.py
+2
-2
tests/quantization/test_quark.py
tests/quantization/test_quark.py
+2
-2
tests/reasoning/test_minimax_m2_append_reasoning_parser.py
tests/reasoning/test_minimax_m2_append_reasoning_parser.py
+195
-0
tests/reasoning/test_minimax_m2_reasoning_parser.py
tests/reasoning/test_minimax_m2_reasoning_parser.py
+230
-0
tests/reasoning/test_mistral_reasoning_parser.py
tests/reasoning/test_mistral_reasoning_parser.py
+85
-62
tests/reasoning/utils.py
tests/reasoning/utils.py
+1
-1
tests/standalone_tests/python_only_compile.sh
tests/standalone_tests/python_only_compile.sh
+36
-3
tests/test_config.py
tests/test_config.py
+1
-59
tests/test_envs.py
tests/test_envs.py
+38
-0
tests/test_inputs.py
tests/test_inputs.py
+9
-2
tests/tokenizers_/test_basic.py
tests/tokenizers_/test_basic.py
+24
-23
No files found.
tests/models/multimodal/processing/test_common.py
View file @
a3f8d5dd
...
...
@@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from
vllm.multimodal.cache
import
MultiModalProcessorOnlyCache
from
vllm.multimodal.inputs
import
MultiModalInputs
,
batched_tensors_equal
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
,
InputProcessingContext
from
vllm.tokenizers
import
(
MistralTokenizer
,
TokenizerLike
,
cached_tokenizer_from_config
,
)
from
vllm.tokenizers
import
TokenizerLike
,
cached_tokenizer_from_config
from
vllm.tokenizers.mistral
import
MistralTokenizer
from
....multimodal.utils
import
random_audio
,
random_image
,
random_video
from
...registry
import
(
...
...
tests/models/multimodal/processing/test_gemma3.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
....conftest
import
ImageTestAssets
from
...utils
import
build_model_context
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"google/gemma-3-4b-it"
])
def
test_get_image_size_with_most_features
(
image_assets
:
ImageTestAssets
,
model_id
:
str
):
ctx
=
build_model_context
(
model_id
,
mm_processor_kwargs
=
{
"do_pan_and_scan"
:
True
},
limit_mm_per_prompt
=
{
"image"
:
1
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
hf_processor_mm_kwargs
:
dict
[
str
,
object
]
=
{}
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
max_image_size
=
processor
.
info
.
get_image_size_with_most_features
()
max_tokens
=
processor
.
info
.
get_num_image_tokens
(
image_width
=
max_image_size
.
width
,
image_height
=
max_image_size
.
height
,
processor
=
hf_processor
,
)
prompt
=
"<start_of_image>"
image_seq_length
=
hf_processor
.
image_seq_length
for
asset
in
image_assets
:
mm_data
=
{
"image"
:
[
asset
.
pil_image
]}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
mm_kwargs_data
=
processed_inputs
[
"mm_kwargs"
].
get_data
()
num_patches_tensor
=
mm_kwargs_data
[
"num_patches"
]
tokens
=
int
(
num_patches_tensor
.
item
())
*
image_seq_length
assert
tokens
<=
max_tokens
tests/models/multimodal/processing/test_mllama4.py
View file @
a3f8d5dd
...
...
@@ -60,12 +60,12 @@ def test_profiling(model_id: str, max_model_len: int):
total_num_patches
.
item
()
+
num_tiles
.
item
()
+
3
)
# image start, image, image end
profiled_tokens
=
profiler
.
get_mm_max_
contiguous_
tokens
(
profiled_tokens
=
profiler
.
get_mm_max_tokens
(
max_model_len
,
mm_counts
=
mm_counts
,
)
assert
total_
token
s
==
profiled_tokens
[
"image"
]
assert
total_
num_patche
s
==
profiled_tokens
[
"image"
]
assert
total_tokens
==
sum
(
placeholder
.
length
for
placeholder
in
decoder_dummy_data
.
multi_modal_placeholders
[
"image"
]
...
...
tests/models/multimodal/processing/test_qwen2_vl.py
View file @
a3f8d5dd
...
...
@@ -53,3 +53,38 @@ def test_processor_override(
assert
img_tok_count
==
expected_toks_per_img
*
num_imgs
assert
pixel_shape
[
0
]
==
expected_pixels_shape
[
0
]
*
num_imgs
assert
pixel_shape
[
1
]
==
expected_pixels_shape
[
1
]
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"Qwen/Qwen2-VL-2B-Instruct"
])
@
pytest
.
mark
.
parametrize
(
"max_pixels"
,
[
1280
*
28
*
28
,
1283
*
28
*
28
])
def
test_get_image_size_with_most_features
(
image_assets
:
ImageTestAssets
,
model_id
:
str
,
max_pixels
:
int
,
):
ctx
=
build_model_context
(
model_id
,
mm_processor_kwargs
=
{
"max_pixels"
:
max_pixels
},
limit_mm_per_prompt
=
{
"image"
:
1
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
hf_processor_mm_kwargs
:
dict
[
str
,
object
]
=
{}
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
merge_size
=
processor
.
info
.
get_hf_config
().
vision_config
.
spatial_merge_size
max_image_size
=
processor
.
info
.
get_image_size_with_most_features
()
max_tokens
=
processor
.
info
.
get_num_image_tokens
(
image_width
=
max_image_size
.
width
,
image_height
=
max_image_size
.
height
,
image_processor
=
hf_processor
.
image_processor
,
)
prompt
=
"<|vision_start|><|image_pad|><|vision_end|>"
for
asset
in
image_assets
:
mm_data
=
{
"image"
:
[
asset
.
pil_image
]}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
grid_thw
=
processed_inputs
[
"mm_kwargs"
].
get_data
()[
"image_grid_thw"
].
tolist
()
t
,
h
,
w
=
grid_thw
[
0
]
tokens
=
(
t
*
h
*
w
)
//
(
merge_size
**
2
)
assert
tokens
<
max_tokens
tests/models/multimodal/processing/test_tensor_schema.py
View file @
a3f8d5dd
...
...
@@ -8,6 +8,7 @@ from typing import Any, TypeAlias
import
numpy
as
np
import
pytest
import
torch
import
torch.nn
as
nn
from
PIL
import
Image
...
...
@@ -35,6 +36,7 @@ from vllm.tokenizers import cached_tokenizer_from_config
from
vllm.utils.collection_utils
import
is_list_of
from
vllm.utils.torch_utils
import
set_default_torch_dtype
from
....utils
import
create_new_process_for_each_test
from
...registry
import
HF_EXAMPLE_MODELS
from
...utils
import
dummy_hf_overrides
from
.test_common
import
get_model_ids_to_test
,
get_text_token_prompts
...
...
@@ -136,6 +138,7 @@ def create_batched_mm_kwargs(
)
# TODO(Isotr0py): Don't initalize model during test
@
contextmanager
def
initialize_dummy_model
(
model_cls
:
type
[
nn
.
Module
],
...
...
@@ -150,16 +153,21 @@ def initialize_dummy_model(
backend
=
"nccl"
,
)
initialize_model_parallel
(
tensor_model_parallel_size
=
1
)
current_device
=
torch
.
get_default_device
()
vllm_config
=
VllmConfig
(
model_config
=
model_config
)
with
set_current_vllm_config
(
vllm_config
=
vllm_config
):
with
set_default_torch_dtype
(
model_config
.
dtype
):
torch
.
set_default_device
(
current_platform
.
device_type
)
model
=
model_cls
(
vllm_config
=
vllm_config
)
torch
.
set_default_device
(
current_device
)
yield
model
del
model
cleanup_dist_env_and_memory
()
@
create_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
"model_id"
,
get_model_ids_to_test
())
def
test_model_tensor_schema
(
model_id
:
str
):
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
...
...
tests/models/registry.py
View file @
a3f8d5dd
...
...
@@ -173,10 +173,7 @@ class _HfExamplesInfo:
_TEXT_GENERATION_EXAMPLE_MODELS
=
{
# [Decoder-only]
"AfmoeForCausalLM"
:
_HfExamplesInfo
(
"arcee-ai/Trinity-Nano"
,
is_available_online
=
False
,
),
"AfmoeForCausalLM"
:
_HfExamplesInfo
(
"arcee-ai/Trinity-Nano-Preview"
),
"ApertusForCausalLM"
:
_HfExamplesInfo
(
"swiss-ai/Apertus-8B-Instruct-2509"
),
"AquilaModel"
:
_HfExamplesInfo
(
"BAAI/AquilaChat-7B"
,
trust_remote_code
=
True
),
"AquilaForCausalLM"
:
_HfExamplesInfo
(
"BAAI/AquilaChat2-7B"
,
trust_remote_code
=
True
),
...
...
@@ -359,7 +356,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
),
"MistralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-7B-Instruct-v0.1"
),
"MistralLarge3ForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4"
,
is_available_online
=
False
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4"
),
"MixtralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mixtral-8x7B-Instruct-v0.1"
,
...
...
@@ -576,12 +573,17 @@ _AUTOMATIC_CONVERTED_MODELS = {
"Qwen3ForSequenceClassification"
:
_HfExamplesInfo
(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
),
"Qwen3ForTokenClassification"
:
_HfExamplesInfo
(
"bd2lcco/Qwen3-0.6B-finetuned"
),
}
_MULTIMODAL_EXAMPLE_MODELS
=
{
# [Decoder-only]
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AudioFlamingo3ForConditionalGeneration"
:
_HfExamplesInfo
(
"nvidia/audio-flamingo-3-hf"
,
min_transformers_version
=
"5.0.0.dev"
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereLabs/aya-vision-8b"
),
"BagelForConditionalGeneration"
:
_HfExamplesInfo
(
"ByteDance-Seed/BAGEL-7B-MoT"
),
"BeeForConditionalGeneration"
:
_HfExamplesInfo
(
"Open-Bee/Bee-8B-RL"
,
trust_remote_code
=
True
,
...
...
@@ -638,7 +640,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
"HunYuanVLForConditionalGeneration"
:
_HfExamplesInfo
(
"tencent/HunyuanOCR"
,
is_available_online
=
False
,
hf_overrides
=
{
"num_experts"
:
0
}
,
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
...
...
@@ -677,8 +679,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31"
,
),
"LightOnOCRForConditionalGeneration"
:
_HfExamplesInfo
(
"lightonai/LightOnOCR-1B"
,
is_available_online
=
False
,
"lightonai/LightOnOCR-1B-1025"
),
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
...
...
@@ -782,8 +783,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"ministral-3"
:
"mistralai/Ministral-3-3B-Instruct-2512"
,
},
tokenizer_mode
=
"mistral"
,
# TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available.
is_available_online
=
False
,
),
"QwenVLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen-VL"
,
...
...
@@ -846,7 +845,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online
=
False
,
),
# [Encoder-decoder]
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
"openai/whisper-large-v3"
),
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
"openai/whisper-large-v3-turbo"
,
extras
=
{
"v3"
:
"openai/whisper-large-v3"
},
),
# [Cross-encoder]
"JinaVLForRanking"
:
_HfExamplesInfo
(
"jinaai/jina-reranker-m0"
),
}
...
...
@@ -889,6 +891,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"EagleMistralLarge3ForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-Large-3-675B-Instruct-2512"
,
speculative_model
=
"mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle"
,
# TODO: revert once figuring out OOM in CI
is_available_online
=
False
,
),
"LlamaForCausalLMEagle3"
:
_HfExamplesInfo
(
...
...
tests/multimodal/test_sparse_tensor_validation_unit.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit tests for sparse tensor validation.
Simple, fast unit tests that can run without server fixtures.
Run with: pytest tests/multimodal/test_sparse_tensor_validation_unit.py -v
"""
import
io
import
pytest
import
torch
class
TestSparseTensorValidationContextManager
:
"""Test that torch.sparse.check_sparse_tensor_invariants() works as expected."""
def
test_valid_sparse_tensor_passes
(
self
):
"""Valid sparse tensors should pass validation."""
indices
=
torch
.
tensor
([[
0
,
1
],
[
0
,
1
]])
values
=
torch
.
tensor
([
1.0
,
2.0
])
shape
=
(
2
,
2
)
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
shape
)
dense
=
tensor
.
to_dense
()
assert
dense
.
shape
==
shape
def
test_out_of_bounds_indices_rejected
(
self
):
"""Sparse tensors with out-of-bounds indices should be rejected."""
indices
=
torch
.
tensor
([[
5
],
[
5
]])
# Out of bounds for 2x2
values
=
torch
.
tensor
([
1.0
])
shape
=
(
2
,
2
)
with
pytest
.
raises
(
RuntimeError
)
as
exc_info
:
# noqa: SIM117
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
shape
)
tensor
.
to_dense
()
assert
(
"index"
in
str
(
exc_info
.
value
).
lower
()
or
"bound"
in
str
(
exc_info
.
value
).
lower
()
)
def
test_negative_indices_rejected
(
self
):
"""Sparse tensors with negative indices should be rejected."""
indices
=
torch
.
tensor
([[
-
1
],
[
0
]])
values
=
torch
.
tensor
([
1.0
])
shape
=
(
2
,
2
)
with
pytest
.
raises
(
RuntimeError
):
# noqa: SIM117
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
shape
)
tensor
.
to_dense
()
def
test_without_context_manager_allows_invalid
(
self
):
"""
WITHOUT validation, invalid tensors may not immediately error.
This demonstrates the vulnerability: PyTorch 2.8.0+ doesn't validate
by default, which can lead to memory corruption.
"""
indices
=
torch
.
tensor
([[
100
],
[
100
]])
# Way out of bounds
values
=
torch
.
tensor
([
1.0
])
shape
=
(
2
,
2
)
# Without validation context, this might create an invalid tensor
# (actual behavior depends on PyTorch version)
tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
shape
)
# The tensor object is created, but it's invalid
assert
tensor
.
is_sparse
class
TestTorchLoadWithValidation
:
"""Test torch.load() with sparse tensor validation."""
def
test_load_valid_sparse_tensor_with_validation
(
self
):
"""Valid sparse tensors should load successfully with validation."""
# Create and save a valid sparse tensor
indices
=
torch
.
tensor
([[
0
,
1
],
[
0
,
1
]])
values
=
torch
.
tensor
([
1.0
,
2.0
])
tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
(
2
,
2
))
buffer
=
io
.
BytesIO
()
torch
.
save
(
tensor
,
buffer
)
buffer
.
seek
(
0
)
# Load with validation
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
loaded
=
torch
.
load
(
buffer
,
weights_only
=
True
)
dense
=
loaded
.
to_dense
()
assert
dense
.
shape
==
(
2
,
2
)
def
test_load_invalid_sparse_tensor_rejected
(
self
):
"""Invalid sparse tensors should be caught when loaded with validation."""
# Create an invalid sparse tensor (out of bounds)
indices
=
torch
.
tensor
([[
10
],
[
10
]])
values
=
torch
.
tensor
([
1.0
])
tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
(
2
,
2
))
buffer
=
io
.
BytesIO
()
torch
.
save
(
tensor
,
buffer
)
buffer
.
seek
(
0
)
# Load with validation - should fail on to_dense()
with
pytest
.
raises
(
RuntimeError
):
# noqa: SIM117
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
loaded
=
torch
.
load
(
buffer
,
weights_only
=
True
)
loaded
.
to_dense
()
def
test_load_dense_tensor_unaffected
(
self
):
"""Dense tensors should work normally with the validation context."""
# Create and save a dense tensor
tensor
=
torch
.
randn
(
10
,
20
)
buffer
=
io
.
BytesIO
()
torch
.
save
(
tensor
,
buffer
)
buffer
.
seek
(
0
)
# Load with validation (should have no effect on dense tensors)
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
loaded
=
torch
.
load
(
buffer
,
weights_only
=
True
)
assert
loaded
.
shape
==
(
10
,
20
)
assert
not
loaded
.
is_sparse
if
__name__
==
"__main__"
:
# Allow running directly for quick testing
pytest
.
main
([
__file__
,
"-v"
,
"--tb=short"
])
tests/multimodal/test_utils.py
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
base64
import
mimetypes
import
os
...
...
@@ -8,6 +9,7 @@ from tempfile import NamedTemporaryFile, TemporaryDirectory
import
numpy
as
np
import
pytest
import
torch
from
PIL
import
Image
,
ImageChops
from
vllm.multimodal.image
import
convert_image_mode
...
...
@@ -186,6 +188,7 @@ async def test_fetch_image_error_conversion():
connector
.
fetch_image
(
broken_img
)
@
pytest
.
mark
.
flaky
(
reruns
=
3
,
reruns_delay
=
5
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
...
...
@@ -198,8 +201,12 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
}
)
video_sync
,
metadata_sync
=
connector
.
fetch_video
(
video_url
)
video_async
,
metadata_async
=
await
connector
.
fetch_video_async
(
video_url
)
try
:
video_sync
,
metadata_sync
=
connector
.
fetch_video
(
video_url
)
video_async
,
metadata_async
=
await
connector
.
fetch_video_async
(
video_url
)
except
(
TimeoutError
,
asyncio
.
TimeoutError
)
as
e
:
pytest
.
skip
(
f
"Timeout fetching video (CI network flakiness):
{
e
}
"
)
assert
np
.
array_equal
(
video_sync
,
video_async
)
assert
metadata_sync
==
metadata_async
...
...
@@ -404,6 +411,97 @@ def test_argsort_mm_positions(case):
assert
modality_idxs
==
expected_modality_idxs
@
pytest
.
mark
.
parametrize
(
"is_embed,expected"
,
[
(
None
,
5
),
(
torch
.
tensor
([
True
,
True
,
True
,
True
,
True
]),
5
),
(
torch
.
tensor
([
False
,
False
,
False
,
False
,
False
]),
0
),
(
torch
.
tensor
([
True
,
False
,
True
,
False
,
True
]),
3
),
(
torch
.
tensor
([
True
]),
1
),
],
)
def
test_placeholder_range_get_num_embeds
(
is_embed
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
0
,
length
=
length
,
is_embed
=
is_embed
)
assert
pr
.
get_num_embeds
==
expected
@
pytest
.
mark
.
parametrize
(
"is_embed,expected"
,
[
(
None
,
None
),
(
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
torch
.
tensor
([
0
,
1
,
1
,
2
,
3
]),
),
(
torch
.
tensor
([
True
,
True
,
True
]),
torch
.
tensor
([
1
,
2
,
3
])),
],
)
def
test_placeholder_range_embeds_cumsum
(
is_embed
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
0
,
length
=
length
,
is_embed
=
is_embed
)
if
expected
is
None
:
assert
pr
.
embeds_cumsum
is
None
return
assert
torch
.
equal
(
pr
.
embeds_cumsum
,
expected
)
# cached_property should return the same object on repeated access
assert
pr
.
embeds_cumsum
is
pr
.
embeds_cumsum
@
pytest
.
mark
.
parametrize
(
"is_embed,start_idx,end_idx,expected"
,
[
(
None
,
2
,
4
,
(
2
,
4
)),
(
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
3
,
5
,
(
1
,
3
),
),
(
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
0
,
2
,
(
0
,
1
),
),
(
torch
.
tensor
([
True
,
False
,
True
,
False
]),
2
,
2
,
(
1
,
1
),
),
],
)
def
test_placeholder_range_get_embeds_indices_in_range
(
is_embed
,
start_idx
,
end_idx
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
0
,
length
=
length
,
is_embed
=
is_embed
)
assert
pr
.
get_embeds_indices_in_range
(
start_idx
,
end_idx
)
==
expected
@
pytest
.
mark
.
parametrize
(
"offset,is_embed,expected"
,
[
(
0
,
None
,
[(
0
,
4
)]),
(
2
,
torch
.
tensor
([
False
,
True
,
False
,
True
,
True
]),
[(
3
,
3
),
(
5
,
6
)],
),
(
0
,
torch
.
tensor
([
True
,
True
,
True
,
True
]),
[(
0
,
3
)]),
(
0
,
torch
.
tensor
([
False
,
False
,
False
,
False
]),
[]),
],
)
def
test_placeholder_range_extract_embeds_range
(
offset
,
is_embed
,
expected
):
length
=
len
(
is_embed
)
if
is_embed
is
not
None
else
5
pr
=
PlaceholderRange
(
offset
=
offset
,
length
=
length
,
is_embed
=
is_embed
)
assert
pr
.
extract_embeds_range
()
==
expected
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
...
...
tests/multimodal/test_video.py
View file @
a3f8d5dd
...
...
@@ -147,7 +147,7 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
"""
Regression test for handling videos with broken frames.
This test uses a pre-corrupted video file (assets/corrupted.mp4) that
contains broken
/unreadable
frames to verify the video loader handles
contains broken frames to verify the video loader handles
them gracefully without crashing and returns accurate metadata.
"""
with
monkeypatch
.
context
()
as
m
:
...
...
@@ -177,3 +177,125 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
f
"Expected fewer than
{
metadata
[
'total_num_frames'
]
}
frames, "
f
"but loaded
{
frames
.
shape
[
0
]
}
frames"
)
@
VIDEO_LOADER_REGISTRY
.
register
(
"test_video_backend_override_1"
)
class
TestVideoBackendOverride1
(
VideoLoader
):
"""Test loader that returns FAKE_OUTPUT_1 to verify backend selection."""
@
classmethod
def
load_bytes
(
cls
,
data
:
bytes
,
num_frames
:
int
=
-
1
,
**
kwargs
)
->
tuple
[
npt
.
NDArray
,
dict
]:
return
FAKE_OUTPUT_1
,
{
"video_backend"
:
"test_video_backend_override_1"
}
@
VIDEO_LOADER_REGISTRY
.
register
(
"test_video_backend_override_2"
)
class
TestVideoBackendOverride2
(
VideoLoader
):
"""Test loader that returns FAKE_OUTPUT_2 to verify backend selection."""
@
classmethod
def
load_bytes
(
cls
,
data
:
bytes
,
num_frames
:
int
=
-
1
,
**
kwargs
)
->
tuple
[
npt
.
NDArray
,
dict
]:
return
FAKE_OUTPUT_2
,
{
"video_backend"
:
"test_video_backend_override_2"
}
def
test_video_media_io_backend_kwarg_override
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""
Test that video_backend kwarg can override the VLLM_VIDEO_LOADER_BACKEND
environment variable.
This allows users to dynamically select a different video backend
via --media-io-kwargs without changing the global env var, which is
useful when plugins set a default backend but a specific request
needs a different one.
"""
with
monkeypatch
.
context
()
as
m
:
# Set the env var to one backend
m
.
setenv
(
"VLLM_VIDEO_LOADER_BACKEND"
,
"test_video_backend_override_1"
)
imageio
=
ImageMediaIO
()
# Without video_backend kwarg, should use env var backend
videoio_default
=
VideoMediaIO
(
imageio
,
num_frames
=
10
)
frames_default
,
metadata_default
=
videoio_default
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames_default
,
FAKE_OUTPUT_1
)
assert
metadata_default
[
"video_backend"
]
==
"test_video_backend_override_1"
# With video_backend kwarg, should override env var
videoio_override
=
VideoMediaIO
(
imageio
,
num_frames
=
10
,
video_backend
=
"test_video_backend_override_2"
)
frames_override
,
metadata_override
=
videoio_override
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames_override
,
FAKE_OUTPUT_2
)
assert
metadata_override
[
"video_backend"
]
==
"test_video_backend_override_2"
def
test_video_media_io_backend_kwarg_not_passed_to_loader
(
monkeypatch
:
pytest
.
MonkeyPatch
,
):
"""
Test that video_backend kwarg is consumed by VideoMediaIO and NOT passed
through to the underlying video loader's load_bytes method.
This ensures the kwarg is properly popped from kwargs before forwarding.
"""
@
VIDEO_LOADER_REGISTRY
.
register
(
"test_reject_video_backend_kwarg"
)
class
RejectVideoBackendKwargLoader
(
VideoLoader
):
"""Test loader that fails if video_backend is passed through."""
@
classmethod
def
load_bytes
(
cls
,
data
:
bytes
,
num_frames
:
int
=
-
1
,
**
kwargs
)
->
tuple
[
npt
.
NDArray
,
dict
]:
# This should never receive video_backend in kwargs
if
"video_backend"
in
kwargs
:
raise
AssertionError
(
"video_backend should be consumed by VideoMediaIO, "
"not passed to loader"
)
return
FAKE_OUTPUT_1
,
{
"received_kwargs"
:
list
(
kwargs
.
keys
())}
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_VIDEO_LOADER_BACKEND"
,
"test_reject_video_backend_kwarg"
)
imageio
=
ImageMediaIO
()
# Even when video_backend is provided, it should NOT be passed to loader
videoio
=
VideoMediaIO
(
imageio
,
num_frames
=
10
,
video_backend
=
"test_reject_video_backend_kwarg"
,
other_kwarg
=
"should_pass_through"
,
)
# This should NOT raise AssertionError
frames
,
metadata
=
videoio
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames
,
FAKE_OUTPUT_1
)
# Verify other kwargs are still passed through
assert
"other_kwarg"
in
metadata
[
"received_kwargs"
]
def
test_video_media_io_backend_env_var_fallback
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""
Test that when video_backend kwarg is None or not provided,
VideoMediaIO falls back to VLLM_VIDEO_LOADER_BACKEND env var.
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_VIDEO_LOADER_BACKEND"
,
"test_video_backend_override_2"
)
imageio
=
ImageMediaIO
()
# Explicit None should fall back to env var
videoio_none
=
VideoMediaIO
(
imageio
,
num_frames
=
10
,
video_backend
=
None
)
frames_none
,
metadata_none
=
videoio_none
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames_none
,
FAKE_OUTPUT_2
)
assert
metadata_none
[
"video_backend"
]
==
"test_video_backend_override_2"
# Not providing video_backend should also fall back to env var
videoio_missing
=
VideoMediaIO
(
imageio
,
num_frames
=
10
)
frames_missing
,
metadata_missing
=
videoio_missing
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames_missing
,
FAKE_OUTPUT_2
)
assert
metadata_missing
[
"video_backend"
]
==
"test_video_backend_override_2"
tests/quantization/test_blackwell_moe.py
View file @
a3f8d5dd
...
...
@@ -10,9 +10,9 @@ import pytest
from
tests.utils
import
RemoteOpenAIServer
from
vllm.platforms
import
current_platform
if
not
current_platform
.
is_device_capability
(
100
):
if
not
current_platform
.
is_device_capability
_family
(
100
):
pytest
.
skip
(
"This test only runs on Blackwell GPUs (SM10
0
)."
,
allow_module_level
=
True
"This test only runs on Blackwell GPUs (SM10
x
)."
,
allow_module_level
=
True
)
...
...
tests/quantization/test_quark.py
View file @
a3f8d5dd
...
...
@@ -212,11 +212,11 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
task
=
"wikitext"
rtol
=
0.1
# Smaller cuda
_
graph_sizes to speed up the test.
# Smaller cudagraph_
capture_
sizes to speed up the test.
results
=
lm_eval
.
simple_evaluate
(
model
=
"vllm"
,
model_args
=
config
.
get_model_args
(
tp_size
=
tp_size
,
kwargs
=
{
"cuda
_
graph_sizes"
:
[
16
]}
tp_size
=
tp_size
,
kwargs
=
{
"cudagraph_
capture_
sizes"
:
[
16
]}
),
tasks
=
task
,
batch_size
=
64
,
...
...
tests/reasoning/test_minimax_m2_append_reasoning_parser.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
transformers
import
AutoTokenizer
from
tests.reasoning.utils
import
run_reasoning_extraction
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
parser_name
=
"minimax_m2_append_think"
end_token
=
"</think>"
# MiniMax M2 model path
REASONING_MODEL_NAME
=
"MiniMaxAI/MiniMax-M2"
@
pytest
.
fixture
(
scope
=
"module"
)
def
minimax_m2_tokenizer
():
return
AutoTokenizer
.
from_pretrained
(
REASONING_MODEL_NAME
)
# =============================================================================
# MiniMaxM2AppendThinkReasoningParser behavior:
# - Prepends <think> to the beginning of the output
# - Does NOT separate reasoning and content
# - Returns everything as content (with <think> prepended)
# - reasoning is always None
#
# This parser is used when you want to keep the raw output with <think> added
# =============================================================================
# Case: simple output with end token
SIMPLE_OUTPUT
=
{
"output"
:
"This is reasoning</think>This is response"
,
"reasoning"
:
None
,
"content"
:
"<think>This is reasoning</think>This is response"
,
"is_reasoning_end"
:
True
,
}
# Case: output without end token (reasoning in progress)
NO_END_TOKEN
=
{
"output"
:
"This is reasoning in progress"
,
"reasoning"
:
None
,
"content"
:
"<think>This is reasoning in progress"
,
"is_reasoning_end"
:
False
,
}
# Case: only end token
ONLY_END_TOKEN
=
{
"output"
:
"</think>This is response"
,
"reasoning"
:
None
,
"content"
:
"<think></think>This is response"
,
"is_reasoning_end"
:
True
,
}
# Case: multiple lines
MULTIPLE_LINES
=
{
"output"
:
"Line 1
\n
Line 2</think>Response 1
\n
Response 2"
,
"reasoning"
:
None
,
"content"
:
"<think>Line 1
\n
Line 2</think>Response 1
\n
Response 2"
,
"is_reasoning_end"
:
True
,
}
# Case: empty output (non-streaming prepends <think>)
EMPTY
=
{
"output"
:
""
,
"reasoning"
:
None
,
"content"
:
"<think>"
,
"is_reasoning_end"
:
False
,
}
# Case: empty output streaming (no tokens = no output)
EMPTY_STREAMING
=
{
"output"
:
""
,
"reasoning"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
}
# Case: special characters
SPECIAL_CHARS
=
{
"output"
:
"Let me think... 1+1=2</think>Yes!"
,
"reasoning"
:
None
,
"content"
:
"<think>Let me think... 1+1=2</think>Yes!"
,
"is_reasoning_end"
:
True
,
}
# Case: code in output
CODE_OUTPUT
=
{
"output"
:
"```python
\n
print('hi')
\n
```</think>Here's the code."
,
"reasoning"
:
None
,
"content"
:
"<think>```python
\n
print('hi')
\n
```</think>Here's the code."
,
"is_reasoning_end"
:
True
,
}
TEST_CASES
=
[
pytest
.
param
(
False
,
SIMPLE_OUTPUT
,
id
=
"simple_output"
,
),
pytest
.
param
(
True
,
SIMPLE_OUTPUT
,
id
=
"simple_output_streaming"
,
),
pytest
.
param
(
False
,
NO_END_TOKEN
,
id
=
"no_end_token"
,
),
pytest
.
param
(
True
,
NO_END_TOKEN
,
id
=
"no_end_token_streaming"
,
),
pytest
.
param
(
False
,
ONLY_END_TOKEN
,
id
=
"only_end_token"
,
),
pytest
.
param
(
True
,
ONLY_END_TOKEN
,
id
=
"only_end_token_streaming"
,
),
pytest
.
param
(
False
,
MULTIPLE_LINES
,
id
=
"multiple_lines"
,
),
pytest
.
param
(
True
,
MULTIPLE_LINES
,
id
=
"multiple_lines_streaming"
,
),
pytest
.
param
(
False
,
EMPTY
,
id
=
"empty"
,
),
pytest
.
param
(
True
,
EMPTY_STREAMING
,
id
=
"empty_streaming"
,
),
pytest
.
param
(
False
,
SPECIAL_CHARS
,
id
=
"special_chars"
,
),
pytest
.
param
(
True
,
SPECIAL_CHARS
,
id
=
"special_chars_streaming"
,
),
pytest
.
param
(
False
,
CODE_OUTPUT
,
id
=
"code_output"
,
),
pytest
.
param
(
True
,
CODE_OUTPUT
,
id
=
"code_output_streaming"
,
),
]
@
pytest
.
mark
.
parametrize
(
"streaming, param_dict"
,
TEST_CASES
)
def
test_reasoning
(
streaming
:
bool
,
param_dict
:
dict
,
minimax_m2_tokenizer
,
):
output
=
minimax_m2_tokenizer
.
tokenize
(
param_dict
[
"output"
])
# decode everything to tokens
output_tokens
:
list
[
str
]
=
[
minimax_m2_tokenizer
.
convert_tokens_to_string
([
token
])
for
token
in
output
]
parser
:
ReasoningParser
=
ReasoningParserManager
.
get_reasoning_parser
(
parser_name
)(
minimax_m2_tokenizer
)
reasoning
,
content
=
run_reasoning_extraction
(
parser
,
output_tokens
,
streaming
=
streaming
)
assert
reasoning
==
param_dict
[
"reasoning"
]
assert
content
==
param_dict
[
"content"
]
# Test is_reasoning_end
output_ids
=
minimax_m2_tokenizer
.
convert_tokens_to_ids
(
output
)
is_reasoning_end
=
parser
.
is_reasoning_end
(
output_ids
)
assert
is_reasoning_end
==
param_dict
[
"is_reasoning_end"
]
tests/reasoning/test_minimax_m2_reasoning_parser.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
transformers
import
AutoTokenizer
from
tests.reasoning.utils
import
run_reasoning_extraction
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
parser_name
=
"minimax_m2"
end_token
=
"</think>"
# MiniMax M2 model path
REASONING_MODEL_NAME
=
"MiniMaxAI/MiniMax-M2"
@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
    """Module-scoped fixture returning the tokenizer for REASONING_MODEL_NAME."""
    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tokenizer
# =============================================================================
# How MiniMax M2 output is structured:
# - The model never emits the <think> start token
# - The model only emits the </think> end token
# - Everything before </think> counts as reasoning
# - Everything after </think> is the actual response (content)
# =============================================================================

# Typical case: reasoning, then the end token, then content.
SIMPLE_REASONING = {
    "output": "This is a reasoning section</think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
    "is_reasoning_end": True,
}

# Reasoning followed by the end token, with nothing after it.
COMPLETE_REASONING = {
    "output": "This is a reasoning section</think>",
    "reasoning": "This is a reasoning section",
    "content": None,
    "is_reasoning_end": True,
}

# No end token yet (generation still in progress): all text is reasoning.
NO_END_TOKEN = {
    "output": "This is reasoning in progress",
    "reasoning": "This is reasoning in progress",
    "content": None,
    "is_reasoning_end": False,
}

# Reasoning and content that each span several lines.
MULTIPLE_LINES = {
    "output": "First line\nSecond line</think>Response first line\nResponse second",
    "reasoning": "First line\nSecond line",
    "content": "Response first line\nResponse second",
    "is_reasoning_end": True,
}

# Output starts with the end token: empty reasoning, immediate response.
SHORTEST_REASONING_NO_STREAMING = {
    "output": "</think>This is the response",
    "reasoning": "",
    "content": "This is the response",
    "is_reasoning_end": True,
}

# Same output in streaming mode: reasoning is None because only the end
# token precedes the content.
SHORTEST_REASONING_STREAMING = {
    "output": "</think>This is the response",
    "reasoning": None,
    "content": "This is the response",
    "is_reasoning_end": True,
}

# Empty output, non-streaming expectation.
EMPTY = {
    "output": "",
    "reasoning": "",
    "content": None,
    "is_reasoning_end": False,
}

# Empty output, streaming expectation.
EMPTY_STREAMING = {
    "output": "",
    "reasoning": None,
    "content": None,
    "is_reasoning_end": False,
}

# Reasoning containing punctuation and arithmetic symbols.
SPECIAL_CHARS = {
    "output": "Let me think... 1+1=2, right?</think>Yes, 1+1=2.",
    "reasoning": "Let me think... 1+1=2, right?",
    "content": "Yes, 1+1=2.",
    "is_reasoning_end": True,
}

# Reasoning that contains a fenced code block.
CODE_IN_REASONING = {
    "output": "```python\nprint('hello')\n```</think>Here is the code.",
    "reasoning": "```python\nprint('hello')\n```",
    "content": "Here is the code.",
    "is_reasoning_end": True,
}
# Core cases: no start token (MiniMax M2 actual behavior).
# Each row is (base id, non-streaming case, streaming case). Every row expands
# to a non-streaming param followed by a streaming param whose id carries the
# "_streaming" suffix.
TEST_CASES = [
    pytest.param(use_streaming, case, id=case_id)
    for base_id, plain_case, streaming_case in (
        ("simple_reasoning", SIMPLE_REASONING, SIMPLE_REASONING),
        ("complete_reasoning", COMPLETE_REASONING, COMPLETE_REASONING),
        ("no_end_token", NO_END_TOKEN, NO_END_TOKEN),
        ("multiple_lines", MULTIPLE_LINES, MULTIPLE_LINES),
        (
            "shortest_reasoning",
            SHORTEST_REASONING_NO_STREAMING,
            SHORTEST_REASONING_STREAMING,
        ),
        ("empty", EMPTY, EMPTY_STREAMING),
        ("special_chars", SPECIAL_CHARS, SPECIAL_CHARS),
        ("code_in_reasoning", CODE_IN_REASONING, CODE_IN_REASONING),
    )
    for use_streaming, case, case_id in (
        (False, plain_case, base_id),
        (True, streaming_case, f"{base_id}_streaming"),
    )
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    minimax_m2_tokenizer,
):
    """Check the ``minimax_m2`` reasoning parser against one expected-result case.

    Args:
        streaming: Forwarded to ``run_reasoning_extraction`` as ``streaming``.
        param_dict: Case dict providing the ``output`` text and the expected
            ``reasoning``, ``content`` and ``is_reasoning_end`` values.
        minimax_m2_tokenizer: Tokenizer fixture used to tokenize the output
            and to construct the parser.
    """
    output = minimax_m2_tokenizer.tokenize(param_dict["output"])
    # Convert each token back to text individually, one piece per token.
    output_tokens: list[str] = [
        minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
    ]
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
        parser_name
    )(minimax_m2_tokenizer)

    reasoning, content = run_reasoning_extraction(
        parser, output_tokens, streaming=streaming
    )

    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

    # Test is_reasoning_end
    output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
    is_reasoning_end = parser.is_reasoning_end(output_ids)
    assert is_reasoning_end == param_dict["is_reasoning_end"]

    # Test extract_content_ids. Always pass token ids: the "no content" branch
    # previously passed the token strings, so its empty-result check never
    # exercised the parser on real id input.
    content_ids = parser.extract_content_ids(output_ids)
    if param_dict["content"] is not None:
        expected_ids = minimax_m2_tokenizer.convert_tokens_to_ids(
            minimax_m2_tokenizer.tokenize(param_dict["content"])
        )
        assert content_ids == expected_ids
    else:
        assert content_ids == []
tests/reasoning/test_mistral_reasoning_parser.py
View file @
a3f8d5dd
...
...
@@ -5,7 +5,7 @@ import pytest
from
tests.reasoning.utils
import
run_reasoning_extraction_mistral
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.tokenizers
import
MistralTokenizer
from
vllm.tokenizers
.mistral
import
MistralTokenizer
parser_name
=
"mistral"
...
...
@@ -18,47 +18,53 @@ def mistral_tokenizer():
return
mistral_tokenizer
SIMPLE_REASONING
=
{
INVALID_
SIMPLE_REASONING
=
{
"output"
:
"This is a reasoning section[/THINK]This is the rest"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
Tru
e
,
"reasoning"
:
None
,
"content"
:
"This is
a reasoning sectionThis is
the rest"
,
"is_reasoning_end"
:
Fals
e
,
}
COMPLETE_REASONING
=
{
INVALID_
COMPLETE_REASONING
=
{
"output"
:
"This is a reasoning section[/THINK]"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
None
,
"is_reasoning_end"
:
Tru
e
,
"reasoning"
:
None
,
"content"
:
"This is a reasoning section"
,
"is_reasoning_end"
:
Fals
e
,
}
NO_CONTENT
=
{
"output"
:
"This is
content
"
,
"reasoning"
:
"This is
content
"
,
"output"
:
"
[THINK]
This is
reasoning
"
,
"reasoning"
:
"This is
reasoning
"
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
}
NO_REASONING
=
{
"output"
:
"This is content"
,
"reasoning"
:
None
,
"content"
:
"This is content"
,
"is_reasoning_end"
:
False
,
}
NO_REASONING_STREAMING
=
{
"output"
:
"This is a reasoning section"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
None
,
"reasoning"
:
None
,
"content"
:
"This is a reasoning section"
,
"is_reasoning_end"
:
False
,
}
MULTIPLE_LINES
=
{
INVALID_
MULTIPLE_LINES
=
{
"output"
:
"This
\n
That[/THINK]This is the rest
\n
That"
,
"reasoning"
:
"This
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
"is_reasoning_end"
:
Tru
e
,
"reasoning"
:
None
,
"content"
:
"This
\n
ThatThis
is the rest
\n
That"
,
"is_reasoning_end"
:
Fals
e
,
}
SHORTEST_REASONING_NO_STREAMING
=
{
INVALID_
SHORTEST_REASONING_NO_STREAMING
=
{
"output"
:
"[/THINK]This is the rest"
,
"reasoning"
:
""
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
Tru
e
,
"is_reasoning_end"
:
Fals
e
,
}
SHORTEST_REASONING
=
{
INVALID_
SHORTEST_REASONING
=
{
"output"
:
"[/THINK]This is the rest"
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
Tru
e
,
"is_reasoning_end"
:
Fals
e
,
}
REASONING_WITH_THINK
=
{
"output"
:
"[THINK]This is a reasoning section[/THINK]This is the rest"
,
...
...
@@ -78,17 +84,17 @@ MULTIPLE_LINES_WITH_THINK = {
"content"
:
"This is the rest
\n
That"
,
"is_reasoning_end"
:
True
,
}
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
=
{
INVALID_
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
=
{
"output"
:
"[/THINK]This is the rest"
,
"reasoning"
:
""
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
Tru
e
,
"is_reasoning_end"
:
Fals
e
,
}
SHORTEST_REASONING_WITH_THINK
=
{
INVALID_
SHORTEST_REASONING_WITH_THINK
=
{
"output"
:
"[/THINK]This is the rest"
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
Tru
e
,
"is_reasoning_end"
:
Fals
e
,
}
THINK_NO_END
=
{
"output"
:
"[THINK]This is a reasoning section"
,
...
...
@@ -98,8 +104,8 @@ THINK_NO_END = {
}
EMPTY
=
{
"output"
:
""
,
"reasoning"
:
""
,
"content"
:
None
,
"reasoning"
:
None
,
"content"
:
""
,
"is_reasoning_end"
:
False
,
}
EMPTY_STREAMING
=
{
...
...
@@ -109,47 +115,48 @@ EMPTY_STREAMING = {
"is_reasoning_end"
:
False
,
}
NEW_LINE
=
{
"output"
:
"
\n
[THINK]This is a reasoning section[/THINK]
\n
This is the rest"
,
"output"
:
"
Before
\n
[THINK]This is a reasoning section[/THINK]
\n
This is the rest"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
"
\n
This is the rest"
,
"content"
:
"
Before
\n
\n
This is the rest"
,
"is_reasoning_end"
:
True
,
}
# Streaming cannot handle new lines at the beginning of the output
# because we need to support [THINK]...[/THINK] and [/THINK]...
# We cannot know if the text before [THINK] is reasoning content
# or not.
NEW_LINE_STREAMING
=
{
"output"
:
"
\n
[THINK]This is a reasoning section[/THINK]
\n
This is the rest"
,
"reasoning"
:
"
\n
This is a reasoning section"
,
"content"
:
"
\n
This is the rest"
,
"output"
:
"
Before
\n
[THINK]This is a reasoning section[/THINK]
\n
This is the rest"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
"
Before
\n
\n
This is the rest"
,
"is_reasoning_end"
:
True
,
}
TEST_CASES
=
[
pytest
.
param
(
False
,
SIMPLE_REASONING
,
id
=
"simple_reasoning"
,
INVALID_
SIMPLE_REASONING
,
id
=
"
invalid_
simple_reasoning"
,
),
pytest
.
param
(
True
,
SIMPLE_REASONING
,
id
=
"simple_reasoning_streaming"
,
INVALID_
SIMPLE_REASONING
,
id
=
"
invalid_
simple_reasoning_streaming"
,
),
pytest
.
param
(
False
,
COMPLETE_REASONING
,
id
=
"complete_reasoning"
,
INVALID_
COMPLETE_REASONING
,
id
=
"
invalid_
complete_reasoning"
,
),
pytest
.
param
(
True
,
COMPLETE_REASONING
,
id
=
"complete_reasoning_streaming"
,
INVALID_
COMPLETE_REASONING
,
id
=
"
invalid_
complete_reasoning_streaming"
,
),
pytest
.
param
(
False
,
NO_CONTENT
,
id
=
"no_content_token"
,
id
=
"no_content"
,
),
pytest
.
param
(
False
,
NO_REASONING
,
id
=
"no_reasoning"
,
),
pytest
.
param
(
True
,
...
...
@@ -158,23 +165,23 @@ TEST_CASES = [
),
pytest
.
param
(
False
,
MULTIPLE_LINES
,
id
=
"multiple_lines"
,
INVALID_
MULTIPLE_LINES
,
id
=
"
invalid_
multiple_lines"
,
),
pytest
.
param
(
True
,
MULTIPLE_LINES
,
id
=
"multiple_lines_streaming"
,
INVALID_
MULTIPLE_LINES
,
id
=
"
invalid_
multiple_lines_streaming"
,
),
pytest
.
param
(
True
,
SHORTEST_REASONING
,
id
=
"shortest"
,
INVALID_
SHORTEST_REASONING
,
id
=
"
invalid_
shortest"
,
),
pytest
.
param
(
False
,
SHORTEST_REASONING_NO_STREAMING
,
id
=
"shortest_streaming"
,
INVALID_
SHORTEST_REASONING_NO_STREAMING
,
id
=
"
invalid_
shortest_streaming"
,
),
pytest
.
param
(
False
,
...
...
@@ -208,13 +215,13 @@ TEST_CASES = [
),
pytest
.
param
(
False
,
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
,
id
=
"shortest_with_think"
,
INVALID_
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
,
id
=
"
invalid_
shortest_with_think"
,
),
pytest
.
param
(
True
,
SHORTEST_REASONING_WITH_THINK
,
id
=
"shortest_with_think_streaming"
,
INVALID_
SHORTEST_REASONING_WITH_THINK
,
id
=
"
invalid_
shortest_with_think_streaming"
,
),
pytest
.
param
(
False
,
...
...
@@ -316,10 +323,26 @@ def test_mistral_reasoning(
# Test extract_content
if
param_dict
[
"content"
]
is
not
None
:
content
=
parser
.
extract_content_ids
(
output_tokens
)
assert
content
==
mistral_tokenizer
.
tokenizer
.
encode
(
param_dict
[
"content"
],
bos
=
False
,
eos
=
False
# Handle the case where there are tokens outputted before Thinking.
# This should not occur if the model is well trained and prompted.
if
"[THINK]"
in
param_dict
[
"output"
]
and
not
param_dict
[
"output"
].
startswith
(
"[THINK]"
):
before_content
=
param_dict
[
"output"
].
split
(
"[THINK]"
)[
0
]
before_token_ids
=
mistral_tokenizer
.
tokenizer
.
encode
(
before_content
,
bos
=
False
,
eos
=
False
)
left_to_encode
=
param_dict
[
"content"
][
len
(
before_content
)
:]
# Normal situation.
else
:
before_token_ids
=
[]
left_to_encode
=
param_dict
[
"content"
]
content_tokens
=
parser
.
extract_content_ids
(
output_tokens
)
expected_token_ids
=
before_token_ids
+
mistral_tokenizer
.
tokenizer
.
encode
(
left_to_encode
,
bos
=
False
,
eos
=
False
)
assert
content_tokens
==
expected_token_ids
else
:
content
=
parser
.
extract_content_ids
(
output_tokens
)
assert
content
==
[]
tests/reasoning/utils.py
View file @
a3f8d5dd
...
...
@@ -4,7 +4,7 @@
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
,
DeltaMessage
from
vllm.reasoning
import
ReasoningParser
from
vllm.tokenizers
import
MistralTokenizer
from
vllm.tokenizers
.mistral
import
MistralTokenizer
class
StreamingReasoningReconstructor
:
...
...
tests/standalone_tests/python_only_compile.sh
View file @
a3f8d5dd
...
...
@@ -3,12 +3,45 @@
# for users who do not have any compilers installed on their system
set
-e
set
-x
merge_base_commit
=
$(
git merge-base HEAD origin/main
)
echo
"
C
urrent merge base commit with main:
$merge_base_commit
"
echo
"
INFO: c
urrent merge base commit with main:
$merge_base_commit
"
git show
--oneline
-s
$merge_base_commit
# test whether the metadata.json url is valid, retry each 3 minutes up to 5 times
# this avoids cumbersome error messages & manual retries in case the precompiled wheel
# for the given commit is still being built in the release pipeline
meta_json_url
=
"https://wheels.vllm.ai/
$merge_base_commit
/vllm/metadata.json"
echo
"INFO: will use metadata.json from
$meta_json_url
"
for
i
in
{
1..5
}
;
do
echo
"Checking metadata.json URL (attempt
$i
)..."
if
curl
--fail
"
$meta_json_url
"
>
metadata.json
;
then
echo
"INFO: metadata.json URL is valid."
# check whether it is valid json by python
if
python3
-m
json.tool metadata.json
;
then
echo
"INFO: metadata.json is valid JSON. Proceeding with the test."
else
echo
"CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!"
exit
1
fi
break
fi
# failure handling
if
[
$i
-eq
5
]
;
then
echo
"ERROR: metadata.json URL is still not valid after 5 attempts."
echo
"ERROR: Please check whether the precompiled wheel for commit
$merge_base_commit
exists."
echo
" NOTE: If
$merge_base_commit
is a new commit on main, maybe try again after its release pipeline finishes."
echo
" NOTE: If it fails, please report in #sig-ci channel."
exit
1
else
echo
"WARNING: metadata.json URL is not valid. Retrying in 3 minutes..."
sleep
180
fi
done
set
-x
cd
/vllm-workspace/
# uninstall vllm
...
...
@@ -29,6 +62,6 @@ python3 -c 'import vllm'
# Check if /tmp/changed.file was created
if
[
!
-f
/tmp/changed.file
]
;
then
echo
"changed.file was not created, python only compilation failed"
echo
"
ERROR:
changed.file was not created, python only compilation failed"
exit
1
fi
tests/test_config.py
View file @
a3f8d5dd
...
...
@@ -89,64 +89,6 @@ def test_update_config():
new_config3
=
update_config
(
config3
,
{
"a"
:
"new_value"
})
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
    ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
    [
        ("distilbert/distilgpt2", "generate", "none", "generate"),
        ("intfloat/multilingual-e5-small", "pooling", "none", "embed"),
        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none", "classify"),
        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "embed"),
        ("openai/whisper-small", "generate", "none", "transcription"),
    ],
)
def test_auto_task(
    model_id, expected_runner_type, expected_convert_type, expected_task
):
    """Build a ModelConfig with ``task="auto"`` and check its resolved types.

    ``expected_task`` is supplied by the parametrization but is not asserted.
    """
    model_config = ModelConfig(model_id, task="auto")

    assert model_config.runner_type == expected_runner_type
    assert model_config.convert_type == expected_convert_type
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
    ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
    [
        ("distilbert/distilgpt2", "pooling", "embed", "embed"),
        ("intfloat/multilingual-e5-small", "pooling", "embed", "embed"),
        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify", "classify"),
        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"),
        ("openai/whisper-small", "pooling", "embed", "embed"),
    ],
)
def test_score_task(
    model_id, expected_runner_type, expected_convert_type, expected_task
):
    """Build a ModelConfig with ``task="score"`` and check its resolved types.

    ``expected_task`` is supplied by the parametrization but is not asserted.
    """
    model_config = ModelConfig(model_id, task="score")

    assert model_config.runner_type == expected_runner_type
    assert model_config.convert_type == expected_convert_type
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
    ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
    [
        ("openai/whisper-small", "generate", "none", "transcription"),
    ],
)
def test_transcription_task(
    model_id, expected_runner_type, expected_convert_type, expected_task
):
    """Build a ModelConfig with ``task="transcription"`` and check its types.

    ``expected_task`` is supplied by the parametrization but is not asserted.
    """
    model_config = ModelConfig(model_id, task="transcription")

    assert model_config.runner_type == expected_runner_type
    assert model_config.convert_type == expected_convert_type
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"expected_runner_type"
,
"expected_convert_type"
),
[
...
...
@@ -1085,7 +1027,7 @@ def test_vllm_config_explicit_overrides():
)
# Override one field but not others
pass_config
=
PassConfig
(
e
nabl
e_noop
=
False
)
pass_config
=
PassConfig
(
e
liminat
e_noop
s
=
False
)
compilation_config
=
CompilationConfig
(
pass_config
=
pass_config
)
config
=
VllmConfig
(
model_config
=
regular_model
,
...
...
tests/test_envs.py
View file @
a3f8d5dd
...
...
@@ -8,6 +8,7 @@ import pytest
import
vllm.envs
as
envs
from
vllm.envs
import
(
disable_envs_cache
,
enable_envs_cache
,
env_list_with_choices
,
env_set_with_choices
,
...
...
@@ -57,6 +58,43 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):
envs
.
__getattr__
=
envs
.
__getattr__
.
__wrapped__
def test_getattr_with_reset(monkeypatch: pytest.MonkeyPatch) -> None:
    """Check that the envs cache freezes values and that disabling it resyncs.

    The section that runs with the cache enabled is wrapped in ``try/finally``
    so a failing assertion cannot leave the cache enabled for later tests.
    """
    monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1")
    # __getattr__ is not decorated with functools.cache
    assert not hasattr(envs.__getattr__, "cache_info")

    # Enable envs cache and ignore ongoing environment changes
    enable_envs_cache()
    try:
        assert envs.VLLM_HOST_IP == "1.1.1.1"
        # With cache enabled, the environment variable value is cached and
        # unchanged
        monkeypatch.setenv("VLLM_HOST_IP", "2.2.2.2")
        assert envs.VLLM_HOST_IP == "1.1.1.1"
    finally:
        # Runs at the same point as the original unconditional call on the
        # success path, and additionally on failure.
        disable_envs_cache()

    assert envs.VLLM_HOST_IP == "2.2.2.2"
    # After cache disabled, the environment variable value would be synced
    # with os.environ
    monkeypatch.setenv("VLLM_HOST_IP", "3.3.3.3")
    assert envs.VLLM_HOST_IP == "3.3.3.3"
def test_is_envs_cache_enabled() -> None:
    """Check ``_is_envs_cache_enabled`` across repeated enable/disable calls.

    The final ``disable_envs_cache()`` sits in a ``finally`` block so the cache
    is switched off even when an assertion fails part-way through. On the
    success path the call sequence is the same as before.
    """
    assert not envs._is_envs_cache_enabled()
    try:
        enable_envs_cache()
        assert envs._is_envs_cache_enabled()

        # Only wrap one-layer of cache, so we only need to
        # call disable once to reset.
        enable_envs_cache()
        enable_envs_cache()
        enable_envs_cache()
        disable_envs_cache()
        assert not envs._is_envs_cache_enabled()
    finally:
        # Second disable: on success it checks that a redundant call is
        # harmless; on failure it restores the uncached state.
        disable_envs_cache()
    assert not envs._is_envs_cache_enabled()
class
TestEnvWithChoices
:
"""Test cases for env_with_choices function."""
...
...
tests/test_inputs.py
View file @
a3f8d5dd
...
...
@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
from
vllm.inputs
import
zip_enc_dec_prompts
from
vllm.inputs.parse
import
parse_raw_prompts
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.tokenizers
import
init
_tokenizer_from_config
from
vllm.tokenizers
import
cached
_tokenizer_from_config
pytestmark
=
pytest
.
mark
.
cpu_test
...
...
@@ -34,6 +34,13 @@ INPUTS_SLICES = [
]
# A nested list mixing int and str sub-lists must be rejected with TypeError.
@pytest.mark.parametrize("invalid_input", [[[1, 2], ["foo", "bar"]]])
def test_invalid_input_raise_type_error(invalid_input):
    """Expect ``parse_raw_prompts`` to raise ``TypeError`` for ``invalid_input``."""
    expect_type_error = pytest.raises(TypeError)
    with expect_type_error:
        parse_raw_prompts(invalid_input)
def
test_parse_raw_single_batch_empty
():
with
pytest
.
raises
(
ValueError
,
match
=
"at least one prompt"
):
parse_raw_prompts
([])
...
...
@@ -108,7 +115,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
def
test_preprocessor_always_mm_code_path
(
model_id
,
prompt
):
model_config
=
ModelConfig
(
model
=
model_id
)
tokenizer
=
init
_tokenizer_from_config
(
model_config
)
tokenizer
=
cached
_tokenizer_from_config
(
model_config
)
input_preprocessor
=
InputPreprocessor
(
model_config
,
tokenizer
)
# HF processor adds sep token
...
...
tests/tokenizers_/test_basic.py
View file @
a3f8d5dd
...
...
@@ -3,38 +3,39 @@
from
typing
import
_get_protocol_attrs
# type: ignore
import
pytest
from
transformers
import
PreTrainedTokenizerBase
from
transformers
import
(
PreTrainedTokenizer
,
PreTrainedTokenizerBase
,
PreTrainedTokenizerFast
,
)
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers.mistral
import
MistralTokenizer
def _get_missing_attrs(obj: object, target: type):
    """Return the protocol attributes of ``target`` that ``obj`` does not have.

    Attribute names come from ``typing._get_protocol_attrs(target)`` and are
    kept in the order that call yields them.
    """
    absent = []
    for attr_name in _get_protocol_attrs(target):
        if hasattr(obj, attr_name):
            continue
        absent.append(attr_name)
    return absent
def _assert_tokenizer_like(tokenizer: object):
    """Assert that ``tokenizer`` has every attribute of ``TokenizerLike``.

    On failure the assertion message lists the missing attribute names.
    """
    absent = _get_missing_attrs(tokenizer, TokenizerLike)
    assert not absent, f"Missing attrs: {absent}"
def
test_tokenizer_like_protocol
():
assert
not
(
missing_attrs
:
=
_get_missing_attrs
(
get_tokenizer
(
"gpt2"
,
use_fast
=
False
),
TokenizerLike
,
)
),
f
"Missing attrs:
{
missing_attrs
}
"
assert
not
(
missing_attrs
:
=
_get_missing_attrs
(
get_tokenizer
(
"gpt2"
,
use_fast
=
True
),
TokenizerLike
,
)
),
f
"Missing attrs:
{
missing_attrs
}
"
assert
not
(
missing_attrs
:
=
_get_missing_attrs
(
get_tokenizer
(
"mistralai/Mistral-7B-Instruct-v0.3"
,
tokenizer_mode
=
"mistral"
),
TokenizerLike
,
)
),
f
"Missing attrs:
{
missing_attrs
}
"
tokenizer
=
get_tokenizer
(
"gpt2"
,
use_fast
=
False
)
assert
isinstance
(
tokenizer
,
PreTrainedTokenizer
)
_assert_tokenizer_like
(
tokenizer
)
tokenizer
=
get_tokenizer
(
"gpt2"
,
use_fast
=
True
)
assert
isinstance
(
tokenizer
,
PreTrainedTokenizerFast
)
_assert_tokenizer_like
(
tokenizer
)
tokenizer
=
get_tokenizer
(
"mistralai/Mistral-7B-Instruct-v0.3"
,
tokenizer_mode
=
"mistral"
)
assert
isinstance
(
tokenizer
,
MistralTokenizer
)
_assert_tokenizer_like
(
tokenizer
)
@
pytest
.
mark
.
parametrize
(
"tokenizer_name"
,
[
"facebook/opt-125m"
,
"gpt2"
])
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment