Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a3f8d5dd
Commit
a3f8d5dd
authored
Dec 17, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori
parents
8d75f22e
f34eca5f
Changes
499
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1083 additions
and
175 deletions
+1083
-175
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+2
-5
tests/models/multimodal/processing/test_gemma3.py
tests/models/multimodal/processing/test_gemma3.py
+42
-0
tests/models/multimodal/processing/test_mllama4.py
tests/models/multimodal/processing/test_mllama4.py
+2
-2
tests/models/multimodal/processing/test_qwen2_vl.py
tests/models/multimodal/processing/test_qwen2_vl.py
+35
-0
tests/models/multimodal/processing/test_tensor_schema.py
tests/models/multimodal/processing/test_tensor_schema.py
+8
-0
tests/models/registry.py
tests/models/registry.py
+14
-11
tests/multimodal/test_sparse_tensor_validation_unit.py
tests/multimodal/test_sparse_tensor_validation_unit.py
+134
-0
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+100
-2
tests/multimodal/test_video.py
tests/multimodal/test_video.py
+123
-1
tests/quantization/test_blackwell_moe.py
tests/quantization/test_blackwell_moe.py
+2
-2
tests/quantization/test_quark.py
tests/quantization/test_quark.py
+2
-2
tests/reasoning/test_minimax_m2_append_reasoning_parser.py
tests/reasoning/test_minimax_m2_append_reasoning_parser.py
+195
-0
tests/reasoning/test_minimax_m2_reasoning_parser.py
tests/reasoning/test_minimax_m2_reasoning_parser.py
+230
-0
tests/reasoning/test_mistral_reasoning_parser.py
tests/reasoning/test_mistral_reasoning_parser.py
+85
-62
tests/reasoning/utils.py
tests/reasoning/utils.py
+1
-1
tests/standalone_tests/python_only_compile.sh
tests/standalone_tests/python_only_compile.sh
+36
-3
tests/test_config.py
tests/test_config.py
+1
-59
tests/test_envs.py
tests/test_envs.py
+38
-0
tests/test_inputs.py
tests/test_inputs.py
+9
-2
tests/tokenizers_/test_basic.py
tests/tokenizers_/test_basic.py
+24
-23
No files found.
tests/models/multimodal/processing/test_common.py
View file @
a3f8d5dd
...
@@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
...
@@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from
vllm.multimodal.cache
import
MultiModalProcessorOnlyCache
from
vllm.multimodal.cache
import
MultiModalProcessorOnlyCache
from
vllm.multimodal.inputs
import
MultiModalInputs
,
batched_tensors_equal
from
vllm.multimodal.inputs
import
MultiModalInputs
,
batched_tensors_equal
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
,
InputProcessingContext
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
,
InputProcessingContext
from
vllm.tokenizers
import
(
from
vllm.tokenizers
import
TokenizerLike
,
cached_tokenizer_from_config
MistralTokenizer
,
from
vllm.tokenizers.mistral
import
MistralTokenizer
TokenizerLike
,
cached_tokenizer_from_config
,
)
from
....multimodal.utils
import
random_audio
,
random_image
,
random_video
from
....multimodal.utils
import
random_audio
,
random_image
,
random_video
from
...registry
import
(
from
...registry
import
(
...
...
tests/models/multimodal/processing/test_gemma3.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
....conftest
import
ImageTestAssets
from
...utils
import
build_model_context
@pytest.mark.parametrize("model_id", ["google/gemma-3-4b-it"])
def test_get_image_size_with_most_features(
    image_assets: ImageTestAssets, model_id: str
):
    """Check that no image asset exceeds the reported maximum token count.

    The processor is built with ``do_pan_and_scan`` enabled. The upper bound
    is the token count the processor info reports for the size returned by
    ``get_image_size_with_most_features()``; each asset's token count is its
    ``num_patches`` value multiplied by the HF processor's
    ``image_seq_length``.
    """
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)
    info = processor.info

    hf_processor_mm_kwargs: dict[str, object] = {}
    hf_processor = info.get_hf_processor(**hf_processor_mm_kwargs)

    # Upper bound: tokens produced by the size the processor itself claims
    # yields the most features.
    largest_size = info.get_image_size_with_most_features()
    max_tokens = info.get_num_image_tokens(
        image_width=largest_size.width,
        image_height=largest_size.height,
        processor=hf_processor,
    )

    tokens_per_patch = hf_processor.image_seq_length

    for asset in image_assets:
        outputs = processor.apply(
            "<start_of_image>",
            {"image": [asset.pil_image]},
            hf_processor_mm_kwargs,
        )
        num_patches = outputs["mm_kwargs"].get_data()["num_patches"]
        tokens = int(num_patches.item()) * tokens_per_patch
        assert tokens <= max_tokens
tests/models/multimodal/processing/test_mllama4.py
View file @
a3f8d5dd
...
@@ -60,12 +60,12 @@ def test_profiling(model_id: str, max_model_len: int):
...
@@ -60,12 +60,12 @@ def test_profiling(model_id: str, max_model_len: int):
total_num_patches
.
item
()
+
num_tiles
.
item
()
+
3
total_num_patches
.
item
()
+
num_tiles
.
item
()
+
3
)
# image start, image, image end
)
# image start, image, image end
profiled_tokens
=
profiler
.
get_mm_max_
contiguous_
tokens
(
profiled_tokens
=
profiler
.
get_mm_max_tokens
(
max_model_len
,
max_model_len
,
mm_counts
=
mm_counts
,
mm_counts
=
mm_counts
,
)
)
assert
total_
token
s
==
profiled_tokens
[
"image"
]
assert
total_
num_patche
s
==
profiled_tokens
[
"image"
]
assert
total_tokens
==
sum
(
assert
total_tokens
==
sum
(
placeholder
.
length
placeholder
.
length
for
placeholder
in
decoder_dummy_data
.
multi_modal_placeholders
[
"image"
]
for
placeholder
in
decoder_dummy_data
.
multi_modal_placeholders
[
"image"
]
...
...
tests/models/multimodal/processing/test_qwen2_vl.py
View file @
a3f8d5dd
...
@@ -53,3 +53,38 @@ def test_processor_override(
...
@@ -53,3 +53,38 @@ def test_processor_override(
assert
img_tok_count
==
expected_toks_per_img
*
num_imgs
assert
img_tok_count
==
expected_toks_per_img
*
num_imgs
assert
pixel_shape
[
0
]
==
expected_pixels_shape
[
0
]
*
num_imgs
assert
pixel_shape
[
0
]
==
expected_pixels_shape
[
0
]
*
num_imgs
assert
pixel_shape
[
1
]
==
expected_pixels_shape
[
1
]
assert
pixel_shape
[
1
]
==
expected_pixels_shape
[
1
]
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
@pytest.mark.parametrize("max_pixels", [1280 * 28 * 28, 1283 * 28 * 28])
def test_get_image_size_with_most_features(
    image_assets: ImageTestAssets,
    model_id: str,
    max_pixels: int,
):
    """Check that every image asset stays below the reported token maximum.

    The processor is built with the parametrized ``max_pixels`` override.
    The bound is the token count reported for the size returned by
    ``get_image_size_with_most_features()``; each asset's token count is
    derived from its first ``image_grid_thw`` entry divided by the square of
    the vision config's ``spatial_merge_size``.
    """
    model_ctx = build_model_context(
        model_id,
        mm_processor_kwargs={"max_pixels": max_pixels},
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(model_ctx.model_config)
    info = processor.info

    hf_processor_mm_kwargs: dict[str, object] = {}
    hf_processor = info.get_hf_processor(**hf_processor_mm_kwargs)
    merge_size = info.get_hf_config().vision_config.spatial_merge_size

    largest_size = info.get_image_size_with_most_features()
    max_tokens = info.get_num_image_tokens(
        image_width=largest_size.width,
        image_height=largest_size.height,
        image_processor=hf_processor.image_processor,
    )

    for asset in image_assets:
        outputs = processor.apply(
            "<|vision_start|><|image_pad|><|vision_end|>",
            {"image": [asset.pil_image]},
            hf_processor_mm_kwargs,
        )
        mm_data = outputs["mm_kwargs"].get_data()
        # Only the first grid row is inspected (one image per prompt).
        t, h, w = mm_data["image_grid_thw"].tolist()[0]
        tokens = (t * h * w) // (merge_size**2)
        assert tokens < max_tokens
tests/models/multimodal/processing/test_tensor_schema.py
View file @
a3f8d5dd
...
@@ -8,6 +8,7 @@ from typing import Any, TypeAlias
...
@@ -8,6 +8,7 @@ from typing import Any, TypeAlias
import
numpy
as
np
import
numpy
as
np
import
pytest
import
pytest
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
PIL
import
Image
from
PIL
import
Image
...
@@ -35,6 +36,7 @@ from vllm.tokenizers import cached_tokenizer_from_config
...
@@ -35,6 +36,7 @@ from vllm.tokenizers import cached_tokenizer_from_config
from
vllm.utils.collection_utils
import
is_list_of
from
vllm.utils.collection_utils
import
is_list_of
from
vllm.utils.torch_utils
import
set_default_torch_dtype
from
vllm.utils.torch_utils
import
set_default_torch_dtype
from
....utils
import
create_new_process_for_each_test
from
...registry
import
HF_EXAMPLE_MODELS
from
...registry
import
HF_EXAMPLE_MODELS
from
...utils
import
dummy_hf_overrides
from
...utils
import
dummy_hf_overrides
from
.test_common
import
get_model_ids_to_test
,
get_text_token_prompts
from
.test_common
import
get_model_ids_to_test
,
get_text_token_prompts
...
@@ -136,6 +138,7 @@ def create_batched_mm_kwargs(
...
@@ -136,6 +138,7 @@ def create_batched_mm_kwargs(
)
)
# TODO(Isotr0py): Don't initialize model during test
@
contextmanager
@
contextmanager
def
initialize_dummy_model
(
def
initialize_dummy_model
(
model_cls
:
type
[
nn
.
Module
],
model_cls
:
type
[
nn
.
Module
],
...
@@ -150,16 +153,21 @@ def initialize_dummy_model(
...
@@ -150,16 +153,21 @@ def initialize_dummy_model(
backend
=
"nccl"
,
backend
=
"nccl"
,
)
)
initialize_model_parallel
(
tensor_model_parallel_size
=
1
)
initialize_model_parallel
(
tensor_model_parallel_size
=
1
)
current_device
=
torch
.
get_default_device
()
vllm_config
=
VllmConfig
(
model_config
=
model_config
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
)
with
set_current_vllm_config
(
vllm_config
=
vllm_config
):
with
set_current_vllm_config
(
vllm_config
=
vllm_config
):
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
set_default_torch_dtype
(
model_config
.
dtype
):
torch
.
set_default_device
(
current_platform
.
device_type
)
model
=
model_cls
(
vllm_config
=
vllm_config
)
model
=
model_cls
(
vllm_config
=
vllm_config
)
torch
.
set_default_device
(
current_device
)
yield
model
yield
model
del
model
del
model
cleanup_dist_env_and_memory
()
cleanup_dist_env_and_memory
()
@
create_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
"model_id"
,
get_model_ids_to_test
())
@
pytest
.
mark
.
parametrize
(
"model_id"
,
get_model_ids_to_test
())
def
test_model_tensor_schema
(
model_id
:
str
):
def
test_model_tensor_schema
(
model_id
:
str
):
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
...
...
tests/models/registry.py
View file @
a3f8d5dd
...
@@ -173,10 +173,7 @@ class _HfExamplesInfo:
...
@@ -173,10 +173,7 @@ class _HfExamplesInfo:
_TEXT_GENERATION_EXAMPLE_MODELS
=
{
_TEXT_GENERATION_EXAMPLE_MODELS
=
{
# [Decoder-only]
# [Decoder-only]
"AfmoeForCausalLM"
:
_HfExamplesInfo
(
"AfmoeForCausalLM"
:
_HfExamplesInfo
(
"arcee-ai/Trinity-Nano-Preview"
),
"arcee-ai/Trinity-Nano"
,
is_available_online
=
False
,
),
"ApertusForCausalLM"
:
_HfExamplesInfo
(
"swiss-ai/Apertus-8B-Instruct-2509"
),
"ApertusForCausalLM"
:
_HfExamplesInfo
(
"swiss-ai/Apertus-8B-Instruct-2509"
),
"AquilaModel"
:
_HfExamplesInfo
(
"BAAI/AquilaChat-7B"
,
trust_remote_code
=
True
),
"AquilaModel"
:
_HfExamplesInfo
(
"BAAI/AquilaChat-7B"
,
trust_remote_code
=
True
),
"AquilaForCausalLM"
:
_HfExamplesInfo
(
"BAAI/AquilaChat2-7B"
,
trust_remote_code
=
True
),
"AquilaForCausalLM"
:
_HfExamplesInfo
(
"BAAI/AquilaChat2-7B"
,
trust_remote_code
=
True
),
...
@@ -359,7 +356,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -359,7 +356,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
),
),
"MistralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-7B-Instruct-v0.1"
),
"MistralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-7B-Instruct-v0.1"
),
"MistralLarge3ForCausalLM"
:
_HfExamplesInfo
(
"MistralLarge3ForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4"
,
is_available_online
=
False
"mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4"
),
),
"MixtralForCausalLM"
:
_HfExamplesInfo
(
"MixtralForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mixtral-8x7B-Instruct-v0.1"
,
"mistralai/Mixtral-8x7B-Instruct-v0.1"
,
...
@@ -576,12 +573,17 @@ _AUTOMATIC_CONVERTED_MODELS = {
...
@@ -576,12 +573,17 @@ _AUTOMATIC_CONVERTED_MODELS = {
"Qwen3ForSequenceClassification"
:
_HfExamplesInfo
(
"Qwen3ForSequenceClassification"
:
_HfExamplesInfo
(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
),
),
"Qwen3ForTokenClassification"
:
_HfExamplesInfo
(
"bd2lcco/Qwen3-0.6B-finetuned"
),
}
}
_MULTIMODAL_EXAMPLE_MODELS
=
{
_MULTIMODAL_EXAMPLE_MODELS
=
{
# [Decoder-only]
# [Decoder-only]
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AudioFlamingo3ForConditionalGeneration"
:
_HfExamplesInfo
(
"nvidia/audio-flamingo-3-hf"
,
min_transformers_version
=
"5.0.0.dev"
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereLabs/aya-vision-8b"
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereLabs/aya-vision-8b"
),
"BagelForConditionalGeneration"
:
_HfExamplesInfo
(
"ByteDance-Seed/BAGEL-7B-MoT"
),
"BeeForConditionalGeneration"
:
_HfExamplesInfo
(
"BeeForConditionalGeneration"
:
_HfExamplesInfo
(
"Open-Bee/Bee-8B-RL"
,
"Open-Bee/Bee-8B-RL"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
@@ -638,7 +640,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -638,7 +640,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
),
"HunYuanVLForConditionalGeneration"
:
_HfExamplesInfo
(
"HunYuanVLForConditionalGeneration"
:
_HfExamplesInfo
(
"tencent/HunyuanOCR"
,
"tencent/HunyuanOCR"
,
is_available_online
=
False
,
hf_overrides
=
{
"num_experts"
:
0
}
,
),
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
...
@@ -677,8 +679,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -677,8 +679,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31"
,
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31"
,
),
),
"LightOnOCRForConditionalGeneration"
:
_HfExamplesInfo
(
"LightOnOCRForConditionalGeneration"
:
_HfExamplesInfo
(
"lightonai/LightOnOCR-1B"
,
"lightonai/LightOnOCR-1B-1025"
is_available_online
=
False
,
),
),
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
...
@@ -782,8 +783,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -782,8 +783,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"ministral-3"
:
"mistralai/Ministral-3-3B-Instruct-2512"
,
"ministral-3"
:
"mistralai/Ministral-3-3B-Instruct-2512"
,
},
},
tokenizer_mode
=
"mistral"
,
tokenizer_mode
=
"mistral"
,
# TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available.
is_available_online
=
False
,
),
),
"QwenVLForConditionalGeneration"
:
_HfExamplesInfo
(
"QwenVLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen-VL"
,
"Qwen/Qwen-VL"
,
...
@@ -846,7 +845,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -846,7 +845,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online
=
False
,
is_available_online
=
False
,
),
),
# [Encoder-decoder]
# [Encoder-decoder]
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
"openai/whisper-large-v3"
),
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
"openai/whisper-large-v3-turbo"
,
extras
=
{
"v3"
:
"openai/whisper-large-v3"
},
),
# [Cross-encoder]
# [Cross-encoder]
"JinaVLForRanking"
:
_HfExamplesInfo
(
"jinaai/jina-reranker-m0"
),
"JinaVLForRanking"
:
_HfExamplesInfo
(
"jinaai/jina-reranker-m0"
),
}
}
...
@@ -889,6 +891,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
...
@@ -889,6 +891,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"EagleMistralLarge3ForCausalLM"
:
_HfExamplesInfo
(
"EagleMistralLarge3ForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mistral-Large-3-675B-Instruct-2512"
,
"mistralai/Mistral-Large-3-675B-Instruct-2512"
,
speculative_model
=
"mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle"
,
speculative_model
=
"mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle"
,
# TODO: revert once figuring out OOM in CI
is_available_online
=
False
,
is_available_online
=
False
,
),
),
"LlamaForCausalLMEagle3"
:
_HfExamplesInfo
(
"LlamaForCausalLMEagle3"
:
_HfExamplesInfo
(
...
...
tests/multimodal/test_sparse_tensor_validation_unit.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit tests for sparse tensor validation.
Simple, fast unit tests that can run without server fixtures.
Run with: pytest tests/multimodal/test_sparse_tensor_validation_unit.py -v
"""
import
io
import
pytest
import
torch
class TestSparseTensorValidationContextManager:
    """Test that torch.sparse.check_sparse_tensor_invariants() works as expected.

    Each test builds a small 2x2 COO tensor. Tests that expect rejection
    construct and densify the tensor inside the validation context manager.
    """

    def test_valid_sparse_tensor_passes(self):
        """Valid sparse tensors should pass validation."""
        indices = torch.tensor([[0, 1], [0, 1]])
        values = torch.tensor([1.0, 2.0])
        shape = (2, 2)

        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.sparse_coo_tensor(indices, values, shape)
            dense = tensor.to_dense()

        assert dense.shape == shape
        # A shape-only check would also accept wrong contents, so pin values.
        assert torch.equal(dense, torch.tensor([[1.0, 0.0], [0.0, 2.0]]))

    def test_out_of_bounds_indices_rejected(self):
        """Sparse tensors with out-of-bounds indices should be rejected."""
        indices = torch.tensor([[5], [5]])  # Out of bounds for 2x2
        values = torch.tensor([1.0])
        shape = (2, 2)

        with pytest.raises(RuntimeError) as exc_info:  # noqa: SIM117
            with torch.sparse.check_sparse_tensor_invariants():
                tensor = torch.sparse_coo_tensor(indices, values, shape)
                tensor.to_dense()

        # Normalise the message once instead of once per substring check.
        message = str(exc_info.value).lower()
        assert "index" in message or "bound" in message

    def test_negative_indices_rejected(self):
        """Sparse tensors with negative indices should be rejected."""
        indices = torch.tensor([[-1], [0]])
        values = torch.tensor([1.0])
        shape = (2, 2)

        with pytest.raises(RuntimeError):  # noqa: SIM117
            with torch.sparse.check_sparse_tensor_invariants():
                tensor = torch.sparse_coo_tensor(indices, values, shape)
                tensor.to_dense()

    def test_without_context_manager_allows_invalid(self):
        """
        WITHOUT validation, invalid tensors may not immediately error.

        This demonstrates the vulnerability: PyTorch 2.8.0+ doesn't validate
        by default, which can lead to memory corruption.
        """
        indices = torch.tensor([[100], [100]])  # Way out of bounds
        values = torch.tensor([1.0])
        shape = (2, 2)

        # Validation is switched off explicitly rather than relying on the
        # process-wide default: that default differs between PyTorch versions
        # and can be flipped by check_sparse_tensor_invariants.enable(), which
        # would make this test raise instead of demonstrating the problem.
        tensor = torch.sparse_coo_tensor(
            indices, values, shape, check_invariants=False
        )

        # The tensor object is created, but it's invalid
        assert tensor.is_sparse
class
TestTorchLoadWithValidation
:
"""Test torch.load() with sparse tensor validation."""
def
test_load_valid_sparse_tensor_with_validation
(
self
):
"""Valid sparse tensors should load successfully with validation."""
# Create and save a valid sparse tensor
indices
=
torch
.
tensor
([[
0
,
1
],
[
0
,
1
]])
values
=
torch
.
tensor
([
1.0
,
2.0
])
tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
(
2
,
2
))
buffer
=
io
.
BytesIO
()
torch
.
save
(
tensor
,
buffer
)
buffer
.
seek
(
0
)
# Load with validation
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
loaded
=
torch
.
load
(
buffer
,
weights_only
=
True
)
dense
=
loaded
.
to_dense
()
assert
dense
.
shape
==
(
2
,
2
)
def
test_load_invalid_sparse_tensor_rejected
(
self
):
"""Invalid sparse tensors should be caught when loaded with validation."""
# Create an invalid sparse tensor (out of bounds)
indices
=
torch
.
tensor
([[
10
],
[
10
]])
values
=
torch
.
tensor
([
1.0
])
tensor
=
torch
.
sparse_coo_tensor
(
indices
,
values
,
(
2
,
2
))
buffer
=
io
.
BytesIO
()
torch
.
save
(
tensor
,
buffer
)
buffer
.
seek
(
0
)
# Load with validation - should fail on to_dense()
with
pytest
.
raises
(
RuntimeError
):
# noqa: SIM117
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
loaded
=
torch
.
load
(
buffer
,
weights_only
=
True
)
loaded
.
to_dense
()
def
test_load_dense_tensor_unaffected
(
self
):
"""Dense tensors should work normally with the validation context."""
# Create and save a dense tensor
tensor
=
torch
.
randn
(
10
,
20
)
buffer
=
io
.
BytesIO
()
torch
.
save
(
tensor
,
buffer
)
buffer
.
seek
(
0
)
# Load with validation (should have no effect on dense tensors)
with
torch
.
sparse
.
check_sparse_tensor_invariants
():
loaded
=
torch
.
load
(
buffer
,
weights_only
=
True
)
assert
loaded
.
shape
==
(
10
,
20
)
assert
not
loaded
.
is_sparse
if __name__ == "__main__":
    # Allow running directly for quick testing. pytest.main() returns the
    # session's exit code; propagate it so a failing run does not exit 0.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))
tests/multimodal/test_utils.py
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
base64
import
base64
import
mimetypes
import
mimetypes
import
os
import
os
...
@@ -8,6 +9,7 @@ from tempfile import NamedTemporaryFile, TemporaryDirectory
...
@@ -8,6 +9,7 @@ from tempfile import NamedTemporaryFile, TemporaryDirectory
import
numpy
as
np
import
numpy
as
np
import
pytest
import
pytest
import
torch
from
PIL
import
Image
,
ImageChops
from
PIL
import
Image
,
ImageChops
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.image
import
convert_image_mode
...
@@ -186,6 +188,7 @@ async def test_fetch_image_error_conversion():
...
@@ -186,6 +188,7 @@ async def test_fetch_image_error_conversion():
connector
.
fetch_image
(
broken_img
)
connector
.
fetch_image
(
broken_img
)
@
pytest
.
mark
.
flaky
(
reruns
=
3
,
reruns_delay
=
5
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
...
@@ -198,8 +201,12 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
...
@@ -198,8 +201,12 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
}
}
)
)
video_sync
,
metadata_sync
=
connector
.
fetch_video
(
video_url
)
try
:
video_async
,
metadata_async
=
await
connector
.
fetch_video_async
(
video_url
)
video_sync
,
metadata_sync
=
connector
.
fetch_video
(
video_url
)
video_async
,
metadata_async
=
await
connector
.
fetch_video_async
(
video_url
)
except
(
TimeoutError
,
asyncio
.
TimeoutError
)
as
e
:
pytest
.
skip
(
f
"Timeout fetching video (CI network flakiness):
{
e
}
"
)
assert
np
.
array_equal
(
video_sync
,
video_async
)
assert
np
.
array_equal
(
video_sync
,
video_async
)
assert
metadata_sync
==
metadata_async
assert
metadata_sync
==
metadata_async
...
@@ -404,6 +411,97 @@ def test_argsort_mm_positions(case):
...
@@ -404,6 +411,97 @@ def test_argsort_mm_positions(case):
assert
modality_idxs
==
expected_modality_idxs
assert
modality_idxs
==
expected_modality_idxs
@pytest.mark.parametrize(
    "is_embed,expected",
    [
        (None, 5),
        (torch.tensor([True, True, True, True, True]), 5),
        (torch.tensor([False, False, False, False, False]), 0),
        (torch.tensor([True, False, True, False, True]), 3),
        (torch.tensor([True]), 1),
    ],
)
def test_placeholder_range_get_num_embeds(is_embed, expected):
    """Compare ``PlaceholderRange.get_num_embeds`` with the expected count.

    When no mask is supplied the range is built with a fixed length of 5;
    otherwise its length is the mask length.
    """
    if is_embed is None:
        length = 5
    else:
        length = len(is_embed)

    placeholder = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
    assert placeholder.get_num_embeds == expected
@pytest.mark.parametrize(
    "is_embed,expected",
    [
        (None, None),
        (
            torch.tensor([False, True, False, True, True]),
            torch.tensor([0, 1, 1, 2, 3]),
        ),
        (torch.tensor([True, True, True]), torch.tensor([1, 2, 3])),
    ],
)
def test_placeholder_range_embeds_cumsum(is_embed, expected):
    """Check ``PlaceholderRange.embeds_cumsum`` against the expected tensor.

    With no mask the attribute is expected to be ``None``; otherwise it must
    equal the expected tensor and repeated access must return the same
    object.
    """
    if is_embed is None:
        length = 5
    else:
        length = len(is_embed)
    placeholder = PlaceholderRange(offset=0, length=length, is_embed=is_embed)

    if expected is None:
        assert placeholder.embeds_cumsum is None
    else:
        assert torch.equal(placeholder.embeds_cumsum, expected)
        # cached_property should return the same object on repeated access
        assert placeholder.embeds_cumsum is placeholder.embeds_cumsum
@pytest.mark.parametrize(
    "is_embed,start_idx,end_idx,expected",
    [
        (None, 2, 4, (2, 4)),
        (torch.tensor([False, True, False, True, True]), 3, 5, (1, 3)),
        (torch.tensor([False, True, False, True, True]), 0, 2, (0, 1)),
        (torch.tensor([True, False, True, False]), 2, 2, (1, 1)),
    ],
)
def test_placeholder_range_get_embeds_indices_in_range(
    is_embed, start_idx, end_idx, expected
):
    """Check ``get_embeds_indices_in_range(start_idx, end_idx)`` results.

    The range starts at offset 0; its length is the mask length, or 5 when
    no mask is given.
    """
    if is_embed is None:
        length = 5
    else:
        length = len(is_embed)

    placeholder = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
    actual = placeholder.get_embeds_indices_in_range(start_idx, end_idx)
    assert actual == expected
@pytest.mark.parametrize(
    "offset,is_embed,expected",
    [
        (0, None, [(0, 4)]),
        (
            2,
            torch.tensor([False, True, False, True, True]),
            [(3, 3), (5, 6)],
        ),
        (0, torch.tensor([True, True, True, True]), [(0, 3)]),
        (0, torch.tensor([False, False, False, False]), []),
    ],
)
def test_placeholder_range_extract_embeds_range(offset, is_embed, expected):
    """Check ``PlaceholderRange.extract_embeds_range()`` for several masks.

    The range uses the parametrized offset; its length is the mask length,
    or 5 when no mask is given.
    """
    if is_embed is None:
        length = 5
    else:
        length = len(is_embed)

    placeholder = PlaceholderRange(
        offset=offset, length=length, is_embed=is_embed
    )
    assert placeholder.extract_embeds_range() == expected
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"video_url"
,
TEST_VIDEO_URLS
)
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
@
pytest
.
mark
.
parametrize
(
"num_frames"
,
[
-
1
,
32
,
1800
])
...
...
tests/multimodal/test_video.py
View file @
a3f8d5dd
...
@@ -147,7 +147,7 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
...
@@ -147,7 +147,7 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
"""
"""
Regression test for handling videos with broken frames.
Regression test for handling videos with broken frames.
This test uses a pre-corrupted video file (assets/corrupted.mp4) that
This test uses a pre-corrupted video file (assets/corrupted.mp4) that
contains broken
/unreadable
frames to verify the video loader handles
contains broken frames to verify the video loader handles
them gracefully without crashing and returns accurate metadata.
them gracefully without crashing and returns accurate metadata.
"""
"""
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
...
@@ -177,3 +177,125 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
...
@@ -177,3 +177,125 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
f
"Expected fewer than
{
metadata
[
'total_num_frames'
]
}
frames, "
f
"Expected fewer than
{
metadata
[
'total_num_frames'
]
}
frames, "
f
"but loaded
{
frames
.
shape
[
0
]
}
frames"
f
"but loaded
{
frames
.
shape
[
0
]
}
frames"
)
)
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1")
class TestVideoBackendOverride1(VideoLoader):
    """Stub loader registered as ``test_video_backend_override_1``.

    It ignores its input and always returns FAKE_OUTPUT_1, tagged with its
    own backend name, so tests can tell which backend was selected.
    """

    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict]:
        """Return FAKE_OUTPUT_1 and metadata naming this backend."""
        metadata = {"video_backend": "test_video_backend_override_1"}
        return FAKE_OUTPUT_1, metadata
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2")
class TestVideoBackendOverride2(VideoLoader):
    """Stub loader registered as ``test_video_backend_override_2``.

    It ignores its input and always returns FAKE_OUTPUT_2, tagged with its
    own backend name, so tests can tell which backend was selected.
    """

    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict]:
        """Return FAKE_OUTPUT_2 and metadata naming this backend."""
        metadata = {"video_backend": "test_video_backend_override_2"}
        return FAKE_OUTPUT_2, metadata
def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch):
    """
    Verify that the video_backend kwarg takes precedence over the
    VLLM_VIDEO_LOADER_BACKEND environment variable.

    The env var selects backend 1. A VideoMediaIO built without the kwarg
    must use backend 1, while one built with
    video_backend="test_video_backend_override_2" must use backend 2. This
    lets a caller pick a backend (e.g. via --media-io-kwargs) without
    changing the global env var.
    """
    with monkeypatch.context() as m:
        # Env var points at backend 1 for the whole block.
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_1")

        image_io = ImageMediaIO()

        # No kwarg: the env var backend is used.
        env_frames, env_metadata = VideoMediaIO(
            image_io, num_frames=10
        ).load_bytes(b"test")
        np.testing.assert_array_equal(env_frames, FAKE_OUTPUT_1)
        assert env_metadata["video_backend"] == "test_video_backend_override_1"

        # Explicit kwarg: overrides the env var.
        kwarg_frames, kwarg_metadata = VideoMediaIO(
            image_io,
            num_frames=10,
            video_backend="test_video_backend_override_2",
        ).load_bytes(b"test")
        np.testing.assert_array_equal(kwarg_frames, FAKE_OUTPUT_2)
        assert kwarg_metadata["video_backend"] == "test_video_backend_override_2"
def test_video_media_io_backend_kwarg_not_passed_to_loader(
    monkeypatch: pytest.MonkeyPatch,
):
    """
    Verify that VideoMediaIO consumes the video_backend kwarg itself and does
    not forward it to the selected loader's load_bytes method.

    A loader that raises when it sees video_backend is registered and
    selected; other keyword arguments must still reach it.
    """

    @VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg")
    class RejectVideoBackendKwargLoader(VideoLoader):
        """Test loader that fails if video_backend is passed through."""

        @classmethod
        def load_bytes(
            cls, data: bytes, num_frames: int = -1, **kwargs
        ) -> tuple[npt.NDArray, dict]:
            """Return FAKE_OUTPUT_1 plus the names of the kwargs received."""
            # video_backend must have been stripped before reaching here.
            if "video_backend" in kwargs:
                raise AssertionError(
                    "video_backend should be consumed by VideoMediaIO, not passed to loader"
                )
            received = list(kwargs)
            return FAKE_OUTPUT_1, {"received_kwargs": received}

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_reject_video_backend_kwarg")

        image_io = ImageMediaIO()

        # video_backend is supplied on purpose; the loader must never see it.
        video_io = VideoMediaIO(
            image_io,
            num_frames=10,
            video_backend="test_reject_video_backend_kwarg",
            other_kwarg="should_pass_through",
        )

        # The loader raises AssertionError if the kwarg leaks through.
        frames, metadata = video_io.load_bytes(b"test")
        np.testing.assert_array_equal(frames, FAKE_OUTPUT_1)

        # Unrelated kwargs are still forwarded.
        assert "other_kwarg" in metadata["received_kwargs"]
def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch):
    """
    Verify that VideoMediaIO uses the VLLM_VIDEO_LOADER_BACKEND env var when
    the video_backend kwarg is either explicitly None or omitted.
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_2")

        image_io = ImageMediaIO()

        # video_backend=None: fall back to the env var backend.
        none_frames, none_metadata = VideoMediaIO(
            image_io, num_frames=10, video_backend=None
        ).load_bytes(b"test")
        np.testing.assert_array_equal(none_frames, FAKE_OUTPUT_2)
        assert none_metadata["video_backend"] == "test_video_backend_override_2"

        # video_backend omitted: same fallback.
        omitted_frames, omitted_metadata = VideoMediaIO(
            image_io, num_frames=10
        ).load_bytes(b"test")
        np.testing.assert_array_equal(omitted_frames, FAKE_OUTPUT_2)
        assert omitted_metadata["video_backend"] == "test_video_backend_override_2"
tests/quantization/test_blackwell_moe.py
View file @
a3f8d5dd
...
@@ -10,9 +10,9 @@ import pytest
...
@@ -10,9 +10,9 @@ import pytest
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
if
not
current_platform
.
is_device_capability
(
100
):
if
not
current_platform
.
is_device_capability
_family
(
100
):
pytest
.
skip
(
pytest
.
skip
(
"This test only runs on Blackwell GPUs (SM10
0
)."
,
allow_module_level
=
True
"This test only runs on Blackwell GPUs (SM10
x
)."
,
allow_module_level
=
True
)
)
...
...
tests/quantization/test_quark.py
View file @
a3f8d5dd
...
@@ -212,11 +212,11 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
...
@@ -212,11 +212,11 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
task
=
"wikitext"
task
=
"wikitext"
rtol
=
0.1
rtol
=
0.1
# Smaller cuda
_
graph_sizes to speed up the test.
# Smaller cudagraph_
capture_
sizes to speed up the test.
results
=
lm_eval
.
simple_evaluate
(
results
=
lm_eval
.
simple_evaluate
(
model
=
"vllm"
,
model
=
"vllm"
,
model_args
=
config
.
get_model_args
(
model_args
=
config
.
get_model_args
(
tp_size
=
tp_size
,
kwargs
=
{
"cuda
_
graph_sizes"
:
[
16
]}
tp_size
=
tp_size
,
kwargs
=
{
"cudagraph_
capture_
sizes"
:
[
16
]}
),
),
tasks
=
task
,
tasks
=
task
,
batch_size
=
64
,
batch_size
=
64
,
...
...
tests/reasoning/test_minimax_m2_append_reasoning_parser.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
transformers
import
AutoTokenizer
from
tests.reasoning.utils
import
run_reasoning_extraction
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
parser_name
=
"minimax_m2_append_think"
end_token
=
"</think>"
# MiniMax M2 model path
REASONING_MODEL_NAME
=
"MiniMaxAI/MiniMax-M2"
@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
    """Return the tokenizer loaded for ``REASONING_MODEL_NAME``.

    The fixture is module-scoped, so pytest builds it once per test module.
    """
    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tokenizer
# =============================================================================
# MiniMaxM2AppendThinkReasoningParser behavior:
# - Prepends <think> to the beginning of the output
# - Does NOT separate reasoning and content
# - Returns everything as content (with <think> prepended)
# - reasoning is always None
#
# This parser is used when you want to keep the raw output with <think> added
# =============================================================================
# Case: simple output with end token
SIMPLE_OUTPUT
=
{
"output"
:
"This is reasoning</think>This is response"
,
"reasoning"
:
None
,
"content"
:
"<think>This is reasoning</think>This is response"
,
"is_reasoning_end"
:
True
,
}
# Case: output without end token (reasoning in progress)
NO_END_TOKEN
=
{
"output"
:
"This is reasoning in progress"
,
"reasoning"
:
None
,
"content"
:
"<think>This is reasoning in progress"
,
"is_reasoning_end"
:
False
,
}
# Case: only end token
ONLY_END_TOKEN
=
{
"output"
:
"</think>This is response"
,
"reasoning"
:
None
,
"content"
:
"<think></think>This is response"
,
"is_reasoning_end"
:
True
,
}
# Case: multiple lines
MULTIPLE_LINES
=
{
"output"
:
"Line 1
\n
Line 2</think>Response 1
\n
Response 2"
,
"reasoning"
:
None
,
"content"
:
"<think>Line 1
\n
Line 2</think>Response 1
\n
Response 2"
,
"is_reasoning_end"
:
True
,
}
# Case: empty output (non-streaming prepends <think>)
EMPTY
=
{
"output"
:
""
,
"reasoning"
:
None
,
"content"
:
"<think>"
,
"is_reasoning_end"
:
False
,
}
# Case: empty output streaming (no tokens = no output)
EMPTY_STREAMING
=
{
"output"
:
""
,
"reasoning"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
}
# Case: special characters
SPECIAL_CHARS
=
{
"output"
:
"Let me think... 1+1=2</think>Yes!"
,
"reasoning"
:
None
,
"content"
:
"<think>Let me think... 1+1=2</think>Yes!"
,
"is_reasoning_end"
:
True
,
}
# Case: code in output
CODE_OUTPUT
=
{
"output"
:
"```python
\n
print('hi')
\n
```</think>Here's the code."
,
"reasoning"
:
None
,
"content"
:
"<think>```python
\n
print('hi')
\n
```</think>Here's the code."
,
"is_reasoning_end"
:
True
,
}
TEST_CASES
=
[
pytest
.
param
(
False
,
SIMPLE_OUTPUT
,
id
=
"simple_output"
,
),
pytest
.
param
(
True
,
SIMPLE_OUTPUT
,
id
=
"simple_output_streaming"
,
),
pytest
.
param
(
False
,
NO_END_TOKEN
,
id
=
"no_end_token"
,
),
pytest
.
param
(
True
,
NO_END_TOKEN
,
id
=
"no_end_token_streaming"
,
),
pytest
.
param
(
False
,
ONLY_END_TOKEN
,
id
=
"only_end_token"
,
),
pytest
.
param
(
True
,
ONLY_END_TOKEN
,
id
=
"only_end_token_streaming"
,
),
pytest
.
param
(
False
,
MULTIPLE_LINES
,
id
=
"multiple_lines"
,
),
pytest
.
param
(
True
,
MULTIPLE_LINES
,
id
=
"multiple_lines_streaming"
,
),
pytest
.
param
(
False
,
EMPTY
,
id
=
"empty"
,
),
pytest
.
param
(
True
,
EMPTY_STREAMING
,
id
=
"empty_streaming"
,
),
pytest
.
param
(
False
,
SPECIAL_CHARS
,
id
=
"special_chars"
,
),
pytest
.
param
(
True
,
SPECIAL_CHARS
,
id
=
"special_chars_streaming"
,
),
pytest
.
param
(
False
,
CODE_OUTPUT
,
id
=
"code_output"
,
),
pytest
.
param
(
True
,
CODE_OUTPUT
,
id
=
"code_output_streaming"
,
),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    minimax_m2_tokenizer,
):
    """Run one append-think case through the parser and compare the results.

    Checks the extracted reasoning/content pair and the ``is_reasoning_end``
    verdict against the values recorded in ``param_dict``.
    """
    tokenizer = minimax_m2_tokenizer
    raw_tokens = tokenizer.tokenize(param_dict["output"])

    # Turn every token back into its text piece, one string per token.
    token_texts: list[str] = []
    for tok in raw_tokens:
        token_texts.append(tokenizer.convert_tokens_to_string([tok]))

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)

    reasoning, content = run_reasoning_extraction(
        parser, token_texts, streaming=streaming
    )
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

    # Test is_reasoning_end
    token_ids = tokenizer.convert_tokens_to_ids(raw_tokens)
    assert parser.is_reasoning_end(token_ids) == param_dict["is_reasoning_end"]
tests/reasoning/test_minimax_m2_reasoning_parser.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
transformers
import
AutoTokenizer
from
tests.reasoning.utils
import
run_reasoning_extraction
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
parser_name
=
"minimax_m2"
end_token
=
"</think>"
# MiniMax M2 model path
REASONING_MODEL_NAME
=
"MiniMaxAI/MiniMax-M2"
@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
    """Return the tokenizer loaded for ``REASONING_MODEL_NAME``.

    The fixture is module-scoped, so pytest builds it once per test module.
    """
    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tokenizer
# =============================================================================
# MiniMax M2 specific behavior:
# - Model does NOT generate <think> start token
# - Model only generates </think> end token
# - All content before </think> is reasoning
# - All content after </think> is the actual response (content)
# =============================================================================
# Case: reasoning + end token + content (typical case)
SIMPLE_REASONING
=
{
"output"
:
"This is a reasoning section</think>This is the rest"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
}
# Case: reasoning + end token only (no content after)
COMPLETE_REASONING
=
{
"output"
:
"This is a reasoning section</think>"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
None
,
"is_reasoning_end"
:
True
,
}
# Case: no end token yet (streaming in progress, all is reasoning)
NO_END_TOKEN
=
{
"output"
:
"This is reasoning in progress"
,
"reasoning"
:
"This is reasoning in progress"
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
}
# Case: multiple lines of reasoning
MULTIPLE_LINES
=
{
"output"
:
"First line
\n
Second line</think>Response first line
\n
Response second"
,
"reasoning"
:
"First line
\n
Second line"
,
"content"
:
"Response first line
\n
Response second"
,
"is_reasoning_end"
:
True
,
}
# Case: only end token (empty reasoning, immediate response)
SHORTEST_REASONING_NO_STREAMING
=
{
"output"
:
"</think>This is the response"
,
"reasoning"
:
""
,
"content"
:
"This is the response"
,
"is_reasoning_end"
:
True
,
}
# Case: only end token streaming (reasoning is None because it's just the token)
SHORTEST_REASONING_STREAMING
=
{
"output"
:
"</think>This is the response"
,
"reasoning"
:
None
,
"content"
:
"This is the response"
,
"is_reasoning_end"
:
True
,
}
# Case: empty output
EMPTY
=
{
"output"
:
""
,
"reasoning"
:
""
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
}
# Case: empty streaming
EMPTY_STREAMING
=
{
"output"
:
""
,
"reasoning"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
}
# Case: long reasoning with special characters
SPECIAL_CHARS
=
{
"output"
:
"Let me think... 1+1=2, right?</think>Yes, 1+1=2."
,
"reasoning"
:
"Let me think... 1+1=2, right?"
,
"content"
:
"Yes, 1+1=2."
,
"is_reasoning_end"
:
True
,
}
# Case: reasoning with code blocks
CODE_IN_REASONING
=
{
"output"
:
"```python
\n
print('hello')
\n
```</think>Here is the code."
,
"reasoning"
:
"```python
\n
print('hello')
\n
```"
,
"content"
:
"Here is the code."
,
"is_reasoning_end"
:
True
,
}
TEST_CASES
=
[
# Core cases: no start token (MiniMax M2 actual behavior)
pytest
.
param
(
False
,
SIMPLE_REASONING
,
id
=
"simple_reasoning"
,
),
pytest
.
param
(
True
,
SIMPLE_REASONING
,
id
=
"simple_reasoning_streaming"
,
),
pytest
.
param
(
False
,
COMPLETE_REASONING
,
id
=
"complete_reasoning"
,
),
pytest
.
param
(
True
,
COMPLETE_REASONING
,
id
=
"complete_reasoning_streaming"
,
),
pytest
.
param
(
False
,
NO_END_TOKEN
,
id
=
"no_end_token"
,
),
pytest
.
param
(
True
,
NO_END_TOKEN
,
id
=
"no_end_token_streaming"
,
),
pytest
.
param
(
False
,
MULTIPLE_LINES
,
id
=
"multiple_lines"
,
),
pytest
.
param
(
True
,
MULTIPLE_LINES
,
id
=
"multiple_lines_streaming"
,
),
pytest
.
param
(
False
,
SHORTEST_REASONING_NO_STREAMING
,
id
=
"shortest_reasoning"
,
),
pytest
.
param
(
True
,
SHORTEST_REASONING_STREAMING
,
id
=
"shortest_reasoning_streaming"
,
),
pytest
.
param
(
False
,
EMPTY
,
id
=
"empty"
,
),
pytest
.
param
(
True
,
EMPTY_STREAMING
,
id
=
"empty_streaming"
,
),
pytest
.
param
(
False
,
SPECIAL_CHARS
,
id
=
"special_chars"
,
),
pytest
.
param
(
True
,
SPECIAL_CHARS
,
id
=
"special_chars_streaming"
,
),
pytest
.
param
(
False
,
CODE_IN_REASONING
,
id
=
"code_in_reasoning"
,
),
pytest
.
param
(
True
,
CODE_IN_REASONING
,
id
=
"code_in_reasoning_streaming"
,
),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    minimax_m2_tokenizer,
):
    """Check one MiniMax M2 reasoning case end to end.

    For the given ``param_dict`` this verifies, in order:
    the reasoning/content pair returned by ``run_reasoning_extraction``,
    the ``is_reasoning_end`` verdict on the token ids, and the token ids
    returned by ``extract_content_ids``.

    Args:
        streaming: Whether extraction is run in streaming mode.
        param_dict: Case description with the keys ``output``, ``reasoning``,
            ``content`` and ``is_reasoning_end``.
        minimax_m2_tokenizer: Module-scoped tokenizer fixture.
    """
    output = minimax_m2_tokenizer.tokenize(param_dict["output"])
    # decode everything to tokens
    output_tokens: list[str] = [
        minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
    ]
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
        parser_name
    )(minimax_m2_tokenizer)

    reasoning, content = run_reasoning_extraction(
        parser, output_tokens, streaming=streaming
    )

    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

    # Test is_reasoning_end
    output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
    is_reasoning_end = parser.is_reasoning_end(output_ids)
    assert is_reasoning_end == param_dict["is_reasoning_end"]

    # Test extract_content
    # Feed token ids in both cases: handing over the token strings would make
    # the "no content" check pass regardless of what the parser does with ids.
    content_ids = parser.extract_content_ids(output_ids)
    if param_dict["content"] is not None:
        expected_ids = minimax_m2_tokenizer.convert_tokens_to_ids(
            minimax_m2_tokenizer.tokenize(param_dict["content"])
        )
    else:
        expected_ids = []
    assert content_ids == expected_ids
tests/reasoning/test_mistral_reasoning_parser.py
View file @
a3f8d5dd
...
@@ -5,7 +5,7 @@ import pytest
...
@@ -5,7 +5,7 @@ import pytest
from
tests.reasoning.utils
import
run_reasoning_extraction_mistral
from
tests.reasoning.utils
import
run_reasoning_extraction_mistral
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.tokenizers
import
MistralTokenizer
from
vllm.tokenizers
.mistral
import
MistralTokenizer
parser_name
=
"mistral"
parser_name
=
"mistral"
...
@@ -18,47 +18,53 @@ def mistral_tokenizer():
...
@@ -18,47 +18,53 @@ def mistral_tokenizer():
return
mistral_tokenizer
return
mistral_tokenizer
SIMPLE_REASONING
=
{
INVALID_
SIMPLE_REASONING
=
{
"output"
:
"This is a reasoning section[/THINK]This is the rest"
,
"output"
:
"This is a reasoning section[/THINK]This is the rest"
,
"reasoning"
:
"This is a reasoning section"
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"content"
:
"This is
a reasoning sectionThis is
the rest"
,
"is_reasoning_end"
:
Tru
e
,
"is_reasoning_end"
:
Fals
e
,
}
}
COMPLETE_REASONING
=
{
INVALID_
COMPLETE_REASONING
=
{
"output"
:
"This is a reasoning section[/THINK]"
,
"output"
:
"This is a reasoning section[/THINK]"
,
"reasoning"
:
"This is a reasoning section"
,
"reasoning"
:
None
,
"content"
:
None
,
"content"
:
"This is a reasoning section"
,
"is_reasoning_end"
:
Tru
e
,
"is_reasoning_end"
:
Fals
e
,
}
}
NO_CONTENT
=
{
NO_CONTENT
=
{
"output"
:
"This is
content
"
,
"output"
:
"
[THINK]
This is
reasoning
"
,
"reasoning"
:
"This is
content
"
,
"reasoning"
:
"This is
reasoning
"
,
"content"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
NO_REASONING
=
{
"output"
:
"This is content"
,
"reasoning"
:
None
,
"content"
:
"This is content"
,
"is_reasoning_end"
:
False
,
}
NO_REASONING_STREAMING
=
{
NO_REASONING_STREAMING
=
{
"output"
:
"This is a reasoning section"
,
"output"
:
"This is a reasoning section"
,
"reasoning"
:
"This is a reasoning section"
,
"reasoning"
:
None
,
"content"
:
None
,
"content"
:
"This is a reasoning section"
,
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
MULTIPLE_LINES
=
{
INVALID_
MULTIPLE_LINES
=
{
"output"
:
"This
\n
That[/THINK]This is the rest
\n
That"
,
"output"
:
"This
\n
That[/THINK]This is the rest
\n
That"
,
"reasoning"
:
"This
\n
That"
,
"reasoning"
:
None
,
"content"
:
"This is the rest
\n
That"
,
"content"
:
"This
\n
ThatThis
is the rest
\n
That"
,
"is_reasoning_end"
:
Tru
e
,
"is_reasoning_end"
:
Fals
e
,
}
}
SHORTEST_REASONING_NO_STREAMING
=
{
INVALID_
SHORTEST_REASONING_NO_STREAMING
=
{
"output"
:
"[/THINK]This is the rest"
,
"output"
:
"[/THINK]This is the rest"
,
"reasoning"
:
""
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
Tru
e
,
"is_reasoning_end"
:
Fals
e
,
}
}
SHORTEST_REASONING
=
{
INVALID_
SHORTEST_REASONING
=
{
"output"
:
"[/THINK]This is the rest"
,
"output"
:
"[/THINK]This is the rest"
,
"reasoning"
:
None
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
Tru
e
,
"is_reasoning_end"
:
Fals
e
,
}
}
REASONING_WITH_THINK
=
{
REASONING_WITH_THINK
=
{
"output"
:
"[THINK]This is a reasoning section[/THINK]This is the rest"
,
"output"
:
"[THINK]This is a reasoning section[/THINK]This is the rest"
,
...
@@ -78,17 +84,17 @@ MULTIPLE_LINES_WITH_THINK = {
...
@@ -78,17 +84,17 @@ MULTIPLE_LINES_WITH_THINK = {
"content"
:
"This is the rest
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
=
{
INVALID_
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
=
{
"output"
:
"[/THINK]This is the rest"
,
"output"
:
"[/THINK]This is the rest"
,
"reasoning"
:
""
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
Tru
e
,
"is_reasoning_end"
:
Fals
e
,
}
}
SHORTEST_REASONING_WITH_THINK
=
{
INVALID_
SHORTEST_REASONING_WITH_THINK
=
{
"output"
:
"[/THINK]This is the rest"
,
"output"
:
"[/THINK]This is the rest"
,
"reasoning"
:
None
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
Tru
e
,
"is_reasoning_end"
:
Fals
e
,
}
}
THINK_NO_END
=
{
THINK_NO_END
=
{
"output"
:
"[THINK]This is a reasoning section"
,
"output"
:
"[THINK]This is a reasoning section"
,
...
@@ -98,8 +104,8 @@ THINK_NO_END = {
...
@@ -98,8 +104,8 @@ THINK_NO_END = {
}
}
EMPTY
=
{
EMPTY
=
{
"output"
:
""
,
"output"
:
""
,
"reasoning"
:
""
,
"reasoning"
:
None
,
"content"
:
None
,
"content"
:
""
,
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
EMPTY_STREAMING
=
{
EMPTY_STREAMING
=
{
...
@@ -109,47 +115,48 @@ EMPTY_STREAMING = {
...
@@ -109,47 +115,48 @@ EMPTY_STREAMING = {
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
NEW_LINE
=
{
NEW_LINE
=
{
"output"
:
"
\n
[THINK]This is a reasoning section[/THINK]
\n
This is the rest"
,
"output"
:
"
Before
\n
[THINK]This is a reasoning section[/THINK]
\n
This is the rest"
,
"reasoning"
:
"This is a reasoning section"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
"
\n
This is the rest"
,
"content"
:
"
Before
\n
\n
This is the rest"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
# Streaming cannot handle new lines at the beginning of the output
# because we need to support [THINK]...[/THINK] and [/THINK]...
# We cannot know if the text before [THINK] is reasoning content
# or not.
NEW_LINE_STREAMING
=
{
NEW_LINE_STREAMING
=
{
"output"
:
"
\n
[THINK]This is a reasoning section[/THINK]
\n
This is the rest"
,
"output"
:
"
Before
\n
[THINK]This is a reasoning section[/THINK]
\n
This is the rest"
,
"reasoning"
:
"
\n
This is a reasoning section"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
"
\n
This is the rest"
,
"content"
:
"
Before
\n
\n
This is the rest"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
TEST_CASES
=
[
TEST_CASES
=
[
pytest
.
param
(
pytest
.
param
(
False
,
False
,
SIMPLE_REASONING
,
INVALID_
SIMPLE_REASONING
,
id
=
"simple_reasoning"
,
id
=
"
invalid_
simple_reasoning"
,
),
),
pytest
.
param
(
pytest
.
param
(
True
,
True
,
SIMPLE_REASONING
,
INVALID_
SIMPLE_REASONING
,
id
=
"simple_reasoning_streaming"
,
id
=
"
invalid_
simple_reasoning_streaming"
,
),
),
pytest
.
param
(
pytest
.
param
(
False
,
False
,
COMPLETE_REASONING
,
INVALID_
COMPLETE_REASONING
,
id
=
"complete_reasoning"
,
id
=
"
invalid_
complete_reasoning"
,
),
),
pytest
.
param
(
pytest
.
param
(
True
,
True
,
COMPLETE_REASONING
,
INVALID_
COMPLETE_REASONING
,
id
=
"complete_reasoning_streaming"
,
id
=
"
invalid_
complete_reasoning_streaming"
,
),
),
pytest
.
param
(
pytest
.
param
(
False
,
False
,
NO_CONTENT
,
NO_CONTENT
,
id
=
"no_content_token"
,
id
=
"no_content"
,
),
pytest
.
param
(
False
,
NO_REASONING
,
id
=
"no_reasoning"
,
),
),
pytest
.
param
(
pytest
.
param
(
True
,
True
,
...
@@ -158,23 +165,23 @@ TEST_CASES = [
...
@@ -158,23 +165,23 @@ TEST_CASES = [
),
),
pytest
.
param
(
pytest
.
param
(
False
,
False
,
MULTIPLE_LINES
,
INVALID_
MULTIPLE_LINES
,
id
=
"multiple_lines"
,
id
=
"
invalid_
multiple_lines"
,
),
),
pytest
.
param
(
pytest
.
param
(
True
,
True
,
MULTIPLE_LINES
,
INVALID_
MULTIPLE_LINES
,
id
=
"multiple_lines_streaming"
,
id
=
"
invalid_
multiple_lines_streaming"
,
),
),
pytest
.
param
(
pytest
.
param
(
True
,
True
,
SHORTEST_REASONING
,
INVALID_
SHORTEST_REASONING
,
id
=
"shortest"
,
id
=
"
invalid_
shortest"
,
),
),
pytest
.
param
(
pytest
.
param
(
False
,
False
,
SHORTEST_REASONING_NO_STREAMING
,
INVALID_
SHORTEST_REASONING_NO_STREAMING
,
id
=
"shortest_streaming"
,
id
=
"
invalid_
shortest_streaming"
,
),
),
pytest
.
param
(
pytest
.
param
(
False
,
False
,
...
@@ -208,13 +215,13 @@ TEST_CASES = [
...
@@ -208,13 +215,13 @@ TEST_CASES = [
),
),
pytest
.
param
(
pytest
.
param
(
False
,
False
,
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
,
INVALID_
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
,
id
=
"shortest_with_think"
,
id
=
"
invalid_
shortest_with_think"
,
),
),
pytest
.
param
(
pytest
.
param
(
True
,
True
,
SHORTEST_REASONING_WITH_THINK
,
INVALID_
SHORTEST_REASONING_WITH_THINK
,
id
=
"shortest_with_think_streaming"
,
id
=
"
invalid_
shortest_with_think_streaming"
,
),
),
pytest
.
param
(
pytest
.
param
(
False
,
False
,
...
@@ -316,10 +323,26 @@ def test_mistral_reasoning(
...
@@ -316,10 +323,26 @@ def test_mistral_reasoning(
# Test extract_content
# Test extract_content
if
param_dict
[
"content"
]
is
not
None
:
if
param_dict
[
"content"
]
is
not
None
:
content
=
parser
.
extract_content_ids
(
output_tokens
)
# Handle the case where there are tokens outputted before Thinking.
assert
content
==
mistral_tokenizer
.
tokenizer
.
encode
(
# This should not occur if the model is well trained and prompted.
param_dict
[
"content"
],
bos
=
False
,
eos
=
False
if
"[THINK]"
in
param_dict
[
"output"
]
and
not
param_dict
[
"output"
].
startswith
(
"[THINK]"
):
before_content
=
param_dict
[
"output"
].
split
(
"[THINK]"
)[
0
]
before_token_ids
=
mistral_tokenizer
.
tokenizer
.
encode
(
before_content
,
bos
=
False
,
eos
=
False
)
left_to_encode
=
param_dict
[
"content"
][
len
(
before_content
)
:]
# Normal situation.
else
:
before_token_ids
=
[]
left_to_encode
=
param_dict
[
"content"
]
content_tokens
=
parser
.
extract_content_ids
(
output_tokens
)
expected_token_ids
=
before_token_ids
+
mistral_tokenizer
.
tokenizer
.
encode
(
left_to_encode
,
bos
=
False
,
eos
=
False
)
)
assert
content_tokens
==
expected_token_ids
else
:
else
:
content
=
parser
.
extract_content_ids
(
output_tokens
)
content
=
parser
.
extract_content_ids
(
output_tokens
)
assert
content
==
[]
assert
content
==
[]
tests/reasoning/utils.py
View file @
a3f8d5dd
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
,
DeltaMessage
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
,
DeltaMessage
from
vllm.reasoning
import
ReasoningParser
from
vllm.reasoning
import
ReasoningParser
from
vllm.tokenizers
import
MistralTokenizer
from
vllm.tokenizers
.mistral
import
MistralTokenizer
class
StreamingReasoningReconstructor
:
class
StreamingReasoningReconstructor
:
...
...
tests/standalone_tests/python_only_compile.sh
View file @
a3f8d5dd
...
@@ -3,12 +3,45 @@
...
@@ -3,12 +3,45 @@
# for users who do not have any compilers installed on their system
# for users who do not have any compilers installed on their system
set
-e
set
-e
set
-x
merge_base_commit
=
$(
git merge-base HEAD origin/main
)
merge_base_commit
=
$(
git merge-base HEAD origin/main
)
echo
"
C
urrent merge base commit with main:
$merge_base_commit
"
echo
"
INFO: c
urrent merge base commit with main:
$merge_base_commit
"
git show
--oneline
-s
$merge_base_commit
git show
--oneline
-s
$merge_base_commit
# test whether the metadata.json url is valid, retry each 3 minutes up to 5 times
# this avoids cumbersome error messages & manual retries in case the precompiled wheel
# for the given commit is still being built in the release pipeline
meta_json_url
=
"https://wheels.vllm.ai/
$merge_base_commit
/vllm/metadata.json"
echo
"INFO: will use metadata.json from
$meta_json_url
"
for
i
in
{
1..5
}
;
do
echo
"Checking metadata.json URL (attempt
$i
)..."
if
curl
--fail
"
$meta_json_url
"
>
metadata.json
;
then
echo
"INFO: metadata.json URL is valid."
# check whether it is valid json by python
if
python3
-m
json.tool metadata.json
;
then
echo
"INFO: metadata.json is valid JSON. Proceeding with the test."
else
echo
"CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!"
exit
1
fi
break
fi
# failure handling
if
[
$i
-eq
5
]
;
then
echo
"ERROR: metadata.json URL is still not valid after 5 attempts."
echo
"ERROR: Please check whether the precompiled wheel for commit
$merge_base_commit
exists."
echo
" NOTE: If
$merge_base_commit
is a new commit on main, maybe try again after its release pipeline finishes."
echo
" NOTE: If it fails, please report in #sig-ci channel."
exit
1
else
echo
"WARNING: metadata.json URL is not valid. Retrying in 3 minutes..."
sleep
180
fi
done
set
-x
cd
/vllm-workspace/
cd
/vllm-workspace/
# uninstall vllm
# uninstall vllm
...
@@ -29,6 +62,6 @@ python3 -c 'import vllm'
...
@@ -29,6 +62,6 @@ python3 -c 'import vllm'
# Check if the clangd log file was created
# Check if the clangd log file was created
if
[
!
-f
/tmp/changed.file
]
;
then
if
[
!
-f
/tmp/changed.file
]
;
then
echo
"changed.file was not created, python only compilation failed"
echo
"
ERROR:
changed.file was not created, python only compilation failed"
exit
1
exit
1
fi
fi
tests/test_config.py
View file @
a3f8d5dd
...
@@ -89,64 +89,6 @@ def test_update_config():
...
@@ -89,64 +89,6 @@ def test_update_config():
new_config3
=
update_config
(
config3
,
{
"a"
:
"new_value"
})
new_config3
=
update_config
(
config3
,
{
"a"
:
"new_value"
})
# Can remove once --task option is fully deprecated
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"expected_runner_type"
,
"expected_convert_type"
,
"expected_task"
),
[
(
"distilbert/distilgpt2"
,
"generate"
,
"none"
,
"generate"
),
(
"intfloat/multilingual-e5-small"
,
"pooling"
,
"none"
,
"embed"
),
(
"jason9693/Qwen2.5-1.5B-apeach"
,
"pooling"
,
"classify"
,
"classify"
),
(
"cross-encoder/ms-marco-MiniLM-L-6-v2"
,
"pooling"
,
"none"
,
"classify"
),
(
"Qwen/Qwen2.5-Math-RM-72B"
,
"pooling"
,
"none"
,
"embed"
),
(
"openai/whisper-small"
,
"generate"
,
"none"
,
"transcription"
),
],
)
def
test_auto_task
(
model_id
,
expected_runner_type
,
expected_convert_type
,
expected_task
):
config
=
ModelConfig
(
model_id
,
task
=
"auto"
)
assert
config
.
runner_type
==
expected_runner_type
assert
config
.
convert_type
==
expected_convert_type
# Can remove once --task option is fully deprecated
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"expected_runner_type"
,
"expected_convert_type"
,
"expected_task"
),
[
(
"distilbert/distilgpt2"
,
"pooling"
,
"embed"
,
"embed"
),
(
"intfloat/multilingual-e5-small"
,
"pooling"
,
"embed"
,
"embed"
),
(
"jason9693/Qwen2.5-1.5B-apeach"
,
"pooling"
,
"classify"
,
"classify"
),
(
"cross-encoder/ms-marco-MiniLM-L-6-v2"
,
"pooling"
,
"classify"
,
"classify"
),
(
"Qwen/Qwen2.5-Math-RM-72B"
,
"pooling"
,
"embed"
,
"embed"
),
(
"openai/whisper-small"
,
"pooling"
,
"embed"
,
"embed"
),
],
)
def
test_score_task
(
model_id
,
expected_runner_type
,
expected_convert_type
,
expected_task
):
config
=
ModelConfig
(
model_id
,
task
=
"score"
)
assert
config
.
runner_type
==
expected_runner_type
assert
config
.
convert_type
==
expected_convert_type
# Can remove once --task option is fully deprecated
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"expected_runner_type"
,
"expected_convert_type"
,
"expected_task"
),
[
(
"openai/whisper-small"
,
"generate"
,
"none"
,
"transcription"
),
],
)
def
test_transcription_task
(
model_id
,
expected_runner_type
,
expected_convert_type
,
expected_task
):
config
=
ModelConfig
(
model_id
,
task
=
"transcription"
)
assert
config
.
runner_type
==
expected_runner_type
assert
config
.
convert_type
==
expected_convert_type
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"expected_runner_type"
,
"expected_convert_type"
),
(
"model_id"
,
"expected_runner_type"
,
"expected_convert_type"
),
[
[
...
@@ -1085,7 +1027,7 @@ def test_vllm_config_explicit_overrides():
...
@@ -1085,7 +1027,7 @@ def test_vllm_config_explicit_overrides():
)
)
# Override one field but not others
# Override one field but not others
pass_config
=
PassConfig
(
e
nabl
e_noop
=
False
)
pass_config
=
PassConfig
(
e
liminat
e_noop
s
=
False
)
compilation_config
=
CompilationConfig
(
pass_config
=
pass_config
)
compilation_config
=
CompilationConfig
(
pass_config
=
pass_config
)
config
=
VllmConfig
(
config
=
VllmConfig
(
model_config
=
regular_model
,
model_config
=
regular_model
,
...
...
tests/test_envs.py
View file @
a3f8d5dd
...
@@ -8,6 +8,7 @@ import pytest
...
@@ -8,6 +8,7 @@ import pytest
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
vllm.envs
import
(
from
vllm.envs
import
(
disable_envs_cache
,
enable_envs_cache
,
enable_envs_cache
,
env_list_with_choices
,
env_list_with_choices
,
env_set_with_choices
,
env_set_with_choices
,
...
@@ -57,6 +58,43 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):
...
@@ -57,6 +58,43 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):
envs
.
__getattr__
=
envs
.
__getattr__
.
__wrapped__
envs
.
__getattr__
=
envs
.
__getattr__
.
__wrapped__
def test_getattr_with_reset(monkeypatch: pytest.MonkeyPatch) -> None:
    """Check that ``disable_envs_cache`` undoes ``enable_envs_cache``.

    While the cache is enabled, ``envs.VLLM_HOST_IP`` keeps the first value it
    returned; once the cache is disabled, it follows the environment again.
    """
    monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1")
    # __getattr__ is not decorated with functools.cache
    assert not hasattr(envs.__getattr__, "cache_info")

    # Enable envs cache and ignore ongoing environment changes
    enable_envs_cache()
    try:
        assert envs.VLLM_HOST_IP == "1.1.1.1"
        # With cache enabled, the environment variable value is cached and
        # unchanged
        monkeypatch.setenv("VLLM_HOST_IP", "2.2.2.2")
        assert envs.VLLM_HOST_IP == "1.1.1.1"
    finally:
        # Drop the cache even when an assertion above fails, so the cached
        # state cannot leak into tests that run afterwards.
        disable_envs_cache()

    assert envs.VLLM_HOST_IP == "2.2.2.2"
    # After cache disabled, the environment variable value would be synced
    # with os.environ
    monkeypatch.setenv("VLLM_HOST_IP", "3.3.3.3")
    assert envs.VLLM_HOST_IP == "3.3.3.3"
def test_is_envs_cache_enabled() -> None:
    """Check ``envs._is_envs_cache_enabled`` across enable/disable calls.

    Repeated ``enable_envs_cache`` calls must be undone by a single
    ``disable_envs_cache``, and a further disable must leave the cache off.
    """
    assert not envs._is_envs_cache_enabled()
    enable_envs_cache()
    try:
        assert envs._is_envs_cache_enabled()

        # Only wrap one-layer of cache, so we only need to
        # call disable once to reset.
        for _ in range(3):
            enable_envs_cache()
    finally:
        # Reset even if an assertion above fails, so later tests do not
        # inherit an enabled cache.
        disable_envs_cache()
    assert not envs._is_envs_cache_enabled()

    disable_envs_cache()
    assert not envs._is_envs_cache_enabled()
class
TestEnvWithChoices
:
class
TestEnvWithChoices
:
"""Test cases for env_with_choices function."""
"""Test cases for env_with_choices function."""
...
...
tests/test_inputs.py
View file @
a3f8d5dd
...
@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
...
@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
from
vllm.inputs
import
zip_enc_dec_prompts
from
vllm.inputs
import
zip_enc_dec_prompts
from
vllm.inputs.parse
import
parse_raw_prompts
from
vllm.inputs.parse
import
parse_raw_prompts
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.tokenizers
import
init
_tokenizer_from_config
from
vllm.tokenizers
import
cached
_tokenizer_from_config
pytestmark
=
pytest
.
mark
.
cpu_test
pytestmark
=
pytest
.
mark
.
cpu_test
...
@@ -34,6 +34,13 @@ INPUTS_SLICES = [
...
@@ -34,6 +34,13 @@ INPUTS_SLICES = [
]
]
# A nested list that mixes a list of ints with a list of strings must be
# rejected with a TypeError.
@pytest.mark.parametrize("invalid_input", [[[1, 2], ["foo", "bar"]]])
def test_invalid_input_raise_type_error(invalid_input):
    """Expect ``parse_raw_prompts`` to raise ``TypeError`` for ``invalid_input``."""
    expected_error = pytest.raises(TypeError)
    with expected_error:
        parse_raw_prompts(invalid_input)
def
test_parse_raw_single_batch_empty
():
def
test_parse_raw_single_batch_empty
():
with
pytest
.
raises
(
ValueError
,
match
=
"at least one prompt"
):
with
pytest
.
raises
(
ValueError
,
match
=
"at least one prompt"
):
parse_raw_prompts
([])
parse_raw_prompts
([])
...
@@ -108,7 +115,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
...
@@ -108,7 +115,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
)
def
test_preprocessor_always_mm_code_path
(
model_id
,
prompt
):
def
test_preprocessor_always_mm_code_path
(
model_id
,
prompt
):
model_config
=
ModelConfig
(
model
=
model_id
)
model_config
=
ModelConfig
(
model
=
model_id
)
tokenizer
=
init
_tokenizer_from_config
(
model_config
)
tokenizer
=
cached
_tokenizer_from_config
(
model_config
)
input_preprocessor
=
InputPreprocessor
(
model_config
,
tokenizer
)
input_preprocessor
=
InputPreprocessor
(
model_config
,
tokenizer
)
# HF processor adds sep token
# HF processor adds sep token
...
...
tests/tokenizers_/test_basic.py
View file @
a3f8d5dd
...
@@ -3,38 +3,39 @@
...
@@ -3,38 +3,39 @@
from
typing
import
_get_protocol_attrs
# type: ignore
from
typing
import
_get_protocol_attrs
# type: ignore
import
pytest
import
pytest
from
transformers
import
PreTrainedTokenizerBase
from
transformers
import
(
PreTrainedTokenizer
,
PreTrainedTokenizerBase
,
PreTrainedTokenizerFast
,
)
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers.mistral
import
MistralTokenizer
def
_get_missing_attrs
(
obj
:
object
,
target
:
type
):
def
_get_missing_attrs
(
obj
:
object
,
target
:
type
):
return
[
k
for
k
in
_get_protocol_attrs
(
target
)
if
not
hasattr
(
obj
,
k
)]
return
[
k
for
k
in
_get_protocol_attrs
(
target
)
if
not
hasattr
(
obj
,
k
)]
def
_assert_tokenizer_like
(
tokenizer
:
object
):
missing_attrs
=
_get_missing_attrs
(
tokenizer
,
TokenizerLike
)
assert
not
missing_attrs
,
f
"Missing attrs:
{
missing_attrs
}
"
def
test_tokenizer_like_protocol
():
def
test_tokenizer_like_protocol
():
assert
not
(
tokenizer
=
get_tokenizer
(
"gpt2"
,
use_fast
=
False
)
missing_attrs
:
=
_get_missing_attrs
(
assert
isinstance
(
tokenizer
,
PreTrainedTokenizer
)
get_tokenizer
(
"gpt2"
,
use_fast
=
False
),
_assert_tokenizer_like
(
tokenizer
)
TokenizerLike
,
)
tokenizer
=
get_tokenizer
(
"gpt2"
,
use_fast
=
True
)
),
f
"Missing attrs:
{
missing_attrs
}
"
assert
isinstance
(
tokenizer
,
PreTrainedTokenizerFast
)
_assert_tokenizer_like
(
tokenizer
)
assert
not
(
missing_attrs
:
=
_get_missing_attrs
(
tokenizer
=
get_tokenizer
(
get_tokenizer
(
"gpt2"
,
use_fast
=
True
),
"mistralai/Mistral-7B-Instruct-v0.3"
,
tokenizer_mode
=
"mistral"
TokenizerLike
,
)
)
assert
isinstance
(
tokenizer
,
MistralTokenizer
)
),
f
"Missing attrs:
{
missing_attrs
}
"
_assert_tokenizer_like
(
tokenizer
)
assert
not
(
missing_attrs
:
=
_get_missing_attrs
(
get_tokenizer
(
"mistralai/Mistral-7B-Instruct-v0.3"
,
tokenizer_mode
=
"mistral"
),
TokenizerLike
,
)
),
f
"Missing attrs:
{
missing_attrs
}
"
@
pytest
.
mark
.
parametrize
(
"tokenizer_name"
,
[
"facebook/opt-125m"
,
"gpt2"
])
@
pytest
.
mark
.
parametrize
(
"tokenizer_name"
,
[
"facebook/opt-125m"
,
"gpt2"
])
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment