Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0da93439
Commit
0da93439
authored
Mar 26, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori
parents
25f2f756
298e5108
Changes
613
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
561 additions
and
96 deletions
+561
-96
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+1
-1
tests/models/multimodal/processing/test_nemotron_vl.py
tests/models/multimodal/processing/test_nemotron_vl.py
+1
-1
tests/models/multimodal/processing/test_qwen2_5_omni_embed.py
...s/models/multimodal/processing/test_qwen2_5_omni_embed.py
+4
-2
tests/models/quantization/test_mxfp8.py
tests/models/quantization/test_mxfp8.py
+104
-0
tests/models/registry.py
tests/models/registry.py
+17
-1
tests/models/test_terratorch.py
tests/models/test_terratorch.py
+1
-1
tests/multimodal/media/test_audio.py
tests/multimodal/media/test_audio.py
+17
-23
tests/multimodal/media/test_connector.py
tests/multimodal/media/test_connector.py
+1
-1
tests/multimodal/media/test_video.py
tests/multimodal/media/test_video.py
+52
-0
tests/multimodal/test_audio.py
tests/multimodal/test_audio.py
+19
-19
tests/multimodal/test_embedding_shape_validation.py
tests/multimodal/test_embedding_shape_validation.py
+0
-0
tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py
...in/bge_m3_sparse_processor/sparse_embeddings_processor.py
+105
-19
tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py
...ins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py
+31
-4
tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
...rocessor_plugin/prithvi_io_processor/prithvi_processor.py
+1
-1
tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
.../plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
+24
-1
tests/plugins_tests/test_terratorch_io_processor_plugins.py
tests/plugins_tests/test_terratorch_io_processor_plugins.py
+1
-1
tests/quantization/test_mi3xx_moe.py
tests/quantization/test_mi3xx_moe.py
+6
-0
tests/reasoning/test_kimi_k2_reasoning_parser.py
tests/reasoning/test_kimi_k2_reasoning_parser.py
+155
-0
tests/reasoning/test_step3p5_reasoning_parser.py
tests/reasoning/test_step3p5_reasoning_parser.py
+20
-20
tests/renderers/test_sparse_tensor_validation.py
tests/renderers/test_sparse_tensor_validation.py
+1
-1
No files found.
Too many changes to show.
To preserve performance only
613 of 613+
files are displayed.
Plain diff
Email patch
tests/models/multimodal/processing/test_internvl.py
View file @
0da93439
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num
:
int
,
min_num
:
int
,
max_num
:
int
,
max_num
:
int
,
):
):
from
vllm.
model_executor.model
s.internvl
import
(
from
vllm.
transformers_utils.processor
s.internvl
import
(
calculate_internvl_targets
,
calculate_internvl_targets
,
get_internvl_target_ratios
,
get_internvl_target_ratios
,
)
)
...
...
tests/models/multimodal/processing/test_nemotron_vl.py
View file @
0da93439
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num
:
int
,
min_num
:
int
,
max_num
:
int
,
max_num
:
int
,
):
):
from
vllm.
model_executor.model
s.nemotron_vl
import
(
from
vllm.
transformers_utils.processor
s.nemotron_vl
import
(
calculate_nemotron_vl_targets
,
calculate_nemotron_vl_targets
,
get_nemotron_vl_target_ratios
,
get_nemotron_vl_target_ratios
,
)
)
...
...
tests/models/multimodal/processing/test_qwen2_5_omni_embed.py
View file @
0da93439
...
@@ -185,14 +185,16 @@ def make_mock_model(hidden: int = 8):
...
@@ -185,14 +185,16 @@ def make_mock_model(hidden: int = 8):
# super().embed_input_ids → use SupportsMultiModal.embed_input_ids
# super().embed_input_ids → use SupportsMultiModal.embed_input_ids
def
fake_super_embed
(
def
fake_super_embed
(
ids
,
mm_embs
=
None
,
*
,
is_multimodal
=
None
,
handle_oov_mm_token
=
False
ids
,
mm_embs
=
None
,
*
,
is_multimodal
=
None
,
):
):
return
SupportsMultiModal
.
embed_input_ids
(
return
SupportsMultiModal
.
embed_input_ids
(
model
,
model
,
ids
,
ids
,
mm_embs
,
mm_embs
,
is_multimodal
=
is_multimodal
,
is_multimodal
=
is_multimodal
,
handle_oov_mm_token
=
handle_oov_mm_token
,
)
)
# Bind embed_input_ids as the real method
# Bind embed_input_ids as the real method
...
...
tests/models/quantization/test_mxfp8.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""E2E tests for online MXFP8 quantization.
Loads a BF16 model with ``--quantization mxfp8`` (online quantization) and
compares log-probabilities against the same model served in BF16 without
quantization. This exercises the full pipeline: config parsing,
``Mxfp8OnlineLinearMethod``, ``Mxfp8OnlineMoEMethod``, weight loading,
online quantization / shuffling, and inference through ``apply_monolithic``.
Layer skipping (``modules_to_not_convert``) is configured in the model's
``config.json`` under ``quantization_config`` and is not tested here.
``example_prompts`` is a pytest fixture (from conftest.py) that loads 8
diverse prompts from ``tests/prompts/example.txt``.
"""
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
..utils
import
check_logprobs_close
# A small MoE model that fits on a single GPU and has both linear + MoE layers.
MOE_MODEL
=
"Qwen/Qwen3-30B-A3B"
# A small dense model (no MoE) to validate the linear-only path.
DENSE_MODEL
=
"Qwen/Qwen3-0.6B"
MAX_MODEL_LEN
=
1024
MAX_TOKENS
=
4
NUM_LOG_PROBS
=
8
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"mxfp8"
),
reason
=
"mxfp8 is not supported on this GPU type (requires sm_100+)."
,
)
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
parametrize
(
"model"
,
[
DENSE_MODEL
,
MOE_MODEL
],
ids
=
[
"dense"
,
"moe"
])
def
test_mxfp8_logprobs
(
vllm_runner
,
example_prompts
,
model
:
str
,
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
"""Compare BF16 baseline logprobs against online MXFP8-quantized model.
Runs the same model twice -- once in BF16 (baseline) and once with
online MXFP8 quantization -- then checks that the top log-probabilities
are close. Only 4 tokens are generated to keep the test fast while
still catching numerical divergence.
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"TOKENIZERS_PARALLELISM"
,
"true"
)
with
vllm_runner
(
model
,
max_model_len
=
MAX_MODEL_LEN
,
enforce_eager
=
True
,
)
as
vllm_model
:
baseline_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
MAX_TOKENS
,
NUM_LOG_PROBS
)
with
vllm_runner
(
model
,
max_model_len
=
MAX_MODEL_LEN
,
enforce_eager
=
True
,
quantization
=
"mxfp8"
,
)
as
vllm_model
:
test_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
MAX_TOKENS
,
NUM_LOG_PROBS
)
check_logprobs_close
(
outputs_0_lst
=
baseline_outputs
,
outputs_1_lst
=
test_outputs
,
name_0
=
"bf16"
,
name_1
=
"mxfp8"
,
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"mxfp8"
),
reason
=
"mxfp8 is not supported on this GPU type (requires sm_100+)."
,
)
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
parametrize
(
"model"
,
[
DENSE_MODEL
,
MOE_MODEL
],
ids
=
[
"dense"
,
"moe"
])
def
test_mxfp8_generation
(
vllm_runner
,
model
:
str
)
->
None
:
"""Smoke test: verify online MXFP8 model generates coherent text."""
prompt
=
"1 2 3 4 5"
with
vllm_runner
(
model
,
enforce_eager
=
True
,
quantization
=
"mxfp8"
,
max_model_len
=
MAX_MODEL_LEN
,
)
as
vllm_model
:
output
=
vllm_model
.
generate_greedy
([
prompt
],
max_tokens
=
5
)
generated
=
output
[
0
][
1
]
assert
len
(
generated
)
>
len
(
prompt
),
(
f
"MXFP8 model produced no new tokens. Output:
{
generated
!
r
}
"
)
tests/models/registry.py
View file @
0da93439
...
@@ -628,6 +628,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = {
...
@@ -628,6 +628,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"ColBERTJinaRobertaModel"
]},
hf_overrides
=
{
"architectures"
:
[
"ColBERTJinaRobertaModel"
]},
),
),
"ColBERTLfm2Model"
:
_HfExamplesInfo
(
"LiquidAI/LFM2-ColBERT-350M"
,
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"ColBERTLfm2Model"
]},
),
# [Multimodal]
# [Multimodal]
"ColModernVBertForRetrieval"
:
_HfExamplesInfo
(
"ColModernVBertForRetrieval"
:
_HfExamplesInfo
(
"ModernVBERT/colmodernvbert-merged"
,
"ModernVBERT/colmodernvbert-merged"
,
...
@@ -639,6 +644,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = {
...
@@ -639,6 +644,11 @@ _LATE_INTERACTION_EXAMPLE_MODELS = {
"OpsColQwen3Model"
:
_HfExamplesInfo
(
"OpsColQwen3Model"
:
_HfExamplesInfo
(
"OpenSearch-AI/Ops-Colqwen3-4B"
,
trust_remote_code
=
True
"OpenSearch-AI/Ops-Colqwen3-4B"
,
trust_remote_code
=
True
),
),
"ColQwen3_5"
:
_HfExamplesInfo
(
"athrael-soju/colqwen3.5-4.5B-v3"
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
),
"Qwen3VLNemotronEmbedModel"
:
_HfExamplesInfo
(
"Qwen3VLNemotronEmbedModel"
:
_HfExamplesInfo
(
"nvidia/nemotron-colembed-vl-4b-v2"
,
"nvidia/nemotron-colembed-vl-4b-v2"
,
),
),
...
@@ -774,7 +784,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -774,7 +784,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"rednote-hilab/dots.ocr"
,
trust_remote_code
=
True
"rednote-hilab/dots.ocr"
,
trust_remote_code
=
True
),
),
"Eagle2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Eagle2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"nvidia/Eagle2.5-8B"
,
trust_remote_code
=
True
,
is_available_online
=
False
"nvidia/Eagle2.5-8B"
,
trust_remote_code
=
True
,
),
),
"Emu3ForConditionalGeneration"
:
_HfExamplesInfo
(
"BAAI/Emu3-Chat-hf"
),
"Emu3ForConditionalGeneration"
:
_HfExamplesInfo
(
"BAAI/Emu3-Chat-hf"
),
"Ernie4_5_VLMoeForConditionalGeneration"
:
_HfExamplesInfo
(
"Ernie4_5_VLMoeForConditionalGeneration"
:
_HfExamplesInfo
(
...
@@ -1116,6 +1127,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -1116,6 +1127,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
tokenizer_mode
=
"mistral"
,
tokenizer_mode
=
"mistral"
,
),
),
# [Encoder-decoder]
# [Encoder-decoder]
"CohereASRForConditionalGeneration"
:
_HfExamplesInfo
(
"/host/engines/vllm/audio/2b-release"
,
trust_remote_code
=
True
,
is_available_online
=
False
,
# TODO (ekagra): revert after asr release
),
"NemotronParseForConditionalGeneration"
:
_HfExamplesInfo
(
"NemotronParseForConditionalGeneration"
:
_HfExamplesInfo
(
"nvidia/NVIDIA-Nemotron-Parse-v1.1"
,
trust_remote_code
=
True
"nvidia/NVIDIA-Nemotron-Parse-v1.1"
,
trust_remote_code
=
True
),
),
...
...
tests/models/test_terratorch.py
View file @
0da93439
...
@@ -8,7 +8,7 @@ from tests.conftest import VllmRunner
...
@@ -8,7 +8,7 @@ from tests.conftest import VllmRunner
from
tests.utils
import
create_new_process_for_each_test
from
tests.utils
import
create_new_process_for_each_test
@
create_new_process_for_each_test
()
#
Memory is not cleaned up properly
otherwise
@
create_new_process_for_each_test
()
#
Hangs
otherwise
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model"
,
"model"
,
[
[
...
...
tests/multimodal/media/test_audio.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
from
pathlib
import
Path
from
pathlib
import
Path
from
unittest.mock
import
patch
from
unittest.mock
import
patch
import
librosa
import
librosa
import
numpy
as
np
import
numpy
as
np
import
pybase64
as
base64
import
pytest
import
pytest
from
vllm.multimodal.media
import
AudioMediaIO
from
vllm.multimodal.media
import
AudioMediaIO
from
...conftest
import
AudioTestAssets
pytestmark
=
pytest
.
mark
.
cpu_test
pytestmark
=
pytest
.
mark
.
cpu_test
ASSETS_DIR
=
Path
(
__file__
).
parent
.
parent
/
"assets"
ASSETS_DIR
=
Path
(
__file__
).
parent
.
parent
/
"assets"
...
@@ -22,40 +24,32 @@ def dummy_audio():
...
@@ -22,40 +24,32 @@ def dummy_audio():
@
pytest
.
fixture
@
pytest
.
fixture
def
dummy_audio_bytes
():
def
dummy_audio_bytes
(
audio_assets
:
AudioTestAssets
):
return
b
"FAKEAUDIOBYTES"
with
open
(
audio_assets
[
0
].
get_local_path
(),
"rb"
)
as
f
:
return
f
.
read
()
def
test_audio_media_io_load_bytes
(
dummy_audio_bytes
):
def
test_audio_media_io_load_bytes
(
dummy_audio_bytes
):
audio_io
=
AudioMediaIO
()
audio_io
=
AudioMediaIO
()
with
patch
(
"librosa.load"
)
as
mock_load
:
out
=
audio_io
.
load_bytes
(
dummy_audio_bytes
)
mock_load
.
return_value
=
(
np
.
array
([
0.1
,
0.2
]),
16000
)
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
out
=
audio_io
.
load_bytes
(
dummy_audio_bytes
)
assert
out
[
1
]
==
16000
mock_load
.
assert_called_once
()
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
def
test_audio_media_io_load_base64
(
dummy_audio_bytes
):
def
test_audio_media_io_load_base64
(
dummy_audio_bytes
):
audio_io
=
AudioMediaIO
()
audio_io
=
AudioMediaIO
()
encoded
=
base64
.
b64encode
(
dummy_audio_bytes
).
decode
(
"utf-8"
)
encoded
=
base64
.
b64encode
(
dummy_audio_bytes
).
decode
(
"utf-8"
)
with
patch
.
object
(
AudioMediaIO
,
"load_bytes"
)
as
mock_load_bytes
:
out
=
audio_io
.
load_base64
(
"audio/wav"
,
encoded
)
mock_load_bytes
.
return_value
=
(
np
.
array
([
0.1
,
0.2
]),
16000
)
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
out
=
audio_io
.
load_base64
(
"audio/wav"
,
encoded
)
assert
out
[
1
]
==
16000
mock_load_bytes
.
assert_called_once
()
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
def
test_audio_media_io_load_file
():
def
test_audio_media_io_load_file
(
audio_assets
:
AudioTestAssets
):
audio_io
=
AudioMediaIO
()
audio_io
=
AudioMediaIO
()
path
=
Path
(
"/fake/path.wav"
)
path
=
audio_assets
[
0
].
get_local_path
()
with
patch
(
"librosa.load"
)
as
mock_load
:
out
=
audio_io
.
load_file
(
path
)
mock_load
.
return_value
=
(
np
.
array
([
0.1
,
0.2
]),
16000
)
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
out
=
audio_io
.
load_file
(
path
)
assert
out
[
1
]
==
16000
mock_load
.
assert_called_once_with
(
path
,
sr
=
None
)
assert
isinstance
(
out
[
0
],
np
.
ndarray
)
assert
out
[
1
]
==
16000
def
test_audio_media_io_encode_base64
(
dummy_audio
):
def
test_audio_media_io_encode_base64
(
dummy_audio
):
...
...
tests/multimodal/media/test_connector.py
View file @
0da93439
...
@@ -2,13 +2,13 @@
...
@@ -2,13 +2,13 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
asyncio
import
base64
import
mimetypes
import
mimetypes
import
os
import
os
from
tempfile
import
NamedTemporaryFile
,
TemporaryDirectory
from
tempfile
import
NamedTemporaryFile
,
TemporaryDirectory
import
aiohttp
import
aiohttp
import
numpy
as
np
import
numpy
as
np
import
pybase64
as
base64
import
pytest
import
pytest
import
requests
import
requests
import
torch
import
torch
...
...
tests/multimodal/media/test_video.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
io
from
pathlib
import
Path
from
pathlib
import
Path
import
numpy
as
np
import
numpy
as
np
import
numpy.typing
as
npt
import
numpy.typing
as
npt
import
pybase64
import
pytest
import
pytest
from
PIL
import
Image
from
PIL
import
Image
...
@@ -235,3 +237,53 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
...
@@ -235,3 +237,53 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
frames_missing
,
metadata_missing
=
videoio_missing
.
load_bytes
(
b
"test"
)
frames_missing
,
metadata_missing
=
videoio_missing
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames_missing
,
FAKE_OUTPUT_2
)
np
.
testing
.
assert_array_equal
(
frames_missing
,
FAKE_OUTPUT_2
)
assert
metadata_missing
[
"video_backend"
]
==
"test_video_backend_override_2"
assert
metadata_missing
[
"video_backend"
]
==
"test_video_backend_override_2"
def
test_load_base64_jpeg_returns_metadata
():
"""Regression test: load_base64 with video/jpeg must return metadata.
Previously, base64 JPEG frame sequences returned an empty dict for
metadata, which broke downstream consumers that rely on fields like
total_num_frames and fps. See PR #37301.
"""
num_test_frames
=
3
frame_width
,
frame_height
=
8
,
8
# Build a few tiny JPEG frames and base64-encode them
b64_frames
=
[]
for
i
in
range
(
num_test_frames
):
img
=
Image
.
new
(
"RGB"
,
(
frame_width
,
frame_height
),
color
=
(
i
*
80
,
0
,
0
))
buf
=
io
.
BytesIO
()
img
.
save
(
buf
,
format
=
"JPEG"
)
b64_frames
.
append
(
pybase64
.
b64encode
(
buf
.
getvalue
()).
decode
(
"ascii"
))
data
=
","
.
join
(
b64_frames
)
imageio
=
ImageMediaIO
()
videoio
=
VideoMediaIO
(
imageio
,
num_frames
=
num_test_frames
)
frames
,
metadata
=
videoio
.
load_base64
(
"video/jpeg"
,
data
)
# Frames array shape: (num_frames, H, W, 3)
assert
frames
.
shape
[
0
]
==
num_test_frames
# All required metadata keys must be present
required_keys
=
{
"total_num_frames"
,
"fps"
,
"duration"
,
"video_backend"
,
"frames_indices"
,
"do_sample_frames"
,
}
assert
required_keys
.
issubset
(
metadata
.
keys
()),
(
f
"Missing metadata keys:
{
required_keys
-
metadata
.
keys
()
}
"
)
assert
metadata
[
"total_num_frames"
]
==
num_test_frames
assert
metadata
[
"video_backend"
]
==
"jpeg_sequence"
assert
metadata
[
"frames_indices"
]
==
list
(
range
(
num_test_frames
))
assert
metadata
[
"do_sample_frames"
]
is
False
# Default fps=1 → duration == num_frames
assert
metadata
[
"fps"
]
==
1.0
assert
metadata
[
"duration"
]
==
float
(
num_test_frames
)
tests/multimodal/test_audio.py
View file @
0da93439
...
@@ -14,7 +14,7 @@ from vllm.multimodal.audio import (
...
@@ -14,7 +14,7 @@ from vllm.multimodal.audio import (
AudioSpec
,
AudioSpec
,
ChannelReduction
,
ChannelReduction
,
normalize_audio
,
normalize_audio
,
resample_audio_
librosa
,
resample_audio_
pyav
,
resample_audio_scipy
,
resample_audio_scipy
,
split_audio
,
split_audio
,
)
)
...
@@ -25,14 +25,14 @@ def dummy_audio():
...
@@ -25,14 +25,14 @@ def dummy_audio():
return
np
.
array
([
0.0
,
0.1
,
0.2
,
0.3
,
0.4
],
dtype
=
float
)
return
np
.
array
([
0.0
,
0.1
,
0.2
,
0.3
,
0.4
],
dtype
=
float
)
def
test_resample_audio_
librosa
(
dummy_audio
):
def
test_resample_audio_
pyav
(
dummy_audio
):
with
patch
(
"vllm.multimodal.audio.librosa.resample"
)
as
mock_resample
:
out_down
=
resample_audio_pyav
(
dummy_audio
,
orig_sr
=
4
,
target_sr
=
2
)
mock_resample
.
return_value
=
dummy_audio
*
2
out_up
=
resample_audio_pyav
(
dummy_audio
,
orig_sr
=
2
,
target_sr
=
4
)
out
=
resample_audio_
librosa
(
dummy_audio
,
orig_sr
=
4
4100
,
target_sr
=
22050
)
out
_same
=
resample_audio_
pyav
(
dummy_audio
,
orig_sr
=
4
,
target_sr
=
4
)
mock_resample
.
assert_called_once_with
(
dummy_audio
,
orig_sr
=
44100
,
target_sr
=
22050
assert
len
(
out_down
)
==
3
)
assert
len
(
out_up
)
==
10
assert
np
.
all
(
out
==
dummy_audio
*
2
)
assert
np
.
all
(
out
_same
==
dummy_audio
)
def
test_resample_audio_scipy
(
dummy_audio
):
def
test_resample_audio_scipy
(
dummy_audio
):
...
@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
...
@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
assert
np
.
isfinite
(
out
).
all
()
assert
np
.
isfinite
(
out
).
all
()
def
test_audio_resampler_
librosa
_calls_resample
(
dummy_audio
):
def
test_audio_resampler_
pyav
_calls_resample
(
dummy_audio
):
resampler
=
AudioResampler
(
target_sr
=
22050
,
method
=
"
librosa
"
)
resampler
=
AudioResampler
(
target_sr
=
22050
,
method
=
"
pyav
"
)
with
patch
(
"vllm.multimodal.audio.resample_audio_
librosa
"
)
as
mock_resample
:
with
patch
(
"vllm.multimodal.audio.resample_audio_
pyav
"
)
as
mock_resample
:
mock_resample
.
return_value
=
dummy_audio
mock_resample
.
return_value
=
dummy_audio
out
=
resampler
.
resample
(
dummy_audio
,
orig_sr
=
44100
)
out
=
resampler
.
resample
(
dummy_audio
,
orig_sr
=
44100
)
mock_resample
.
assert_called_once_with
(
mock_resample
.
assert_called_once_with
(
...
@@ -423,13 +423,13 @@ class TestAudioPipelineE2E:
...
@@ -423,13 +423,13 @@ class TestAudioPipelineE2E:
# Verify channel averaging: mean of [0.5, -0.5] = 0.0
# Verify channel averaging: mean of [0.5, -0.5] = 0.0
np
.
testing
.
assert_array_almost_equal
(
audio_output
,
np
.
zeros
(
16000
),
decimal
=
5
)
np
.
testing
.
assert_array_almost_equal
(
audio_output
,
np
.
zeros
(
16000
),
decimal
=
5
)
def
test_
librosa
_mono_passthrough_e2e
(
self
):
def
test_
pyav
_mono_passthrough_e2e
(
self
):
"""Full pipeline:
librosa
mono format → preserved as mono."""
"""Full pipeline:
pyav
mono format → preserved as mono."""
from
vllm.multimodal.parse
import
MultiModalDataParser
from
vllm.multimodal.parse
import
MultiModalDataParser
# Simulate
librosa
output: already mono (time,) format
# Simulate
pyav
output: already mono (time,) format
mono_
librosa
=
np
.
random
.
randn
(
16000
).
astype
(
np
.
float32
)
mono_
pyav
=
np
.
random
.
randn
(
16000
).
astype
(
np
.
float32
)
assert
mono_
librosa
.
shape
==
(
16000
,)
assert
mono_
pyav
.
shape
==
(
16000
,)
# Create parser with mono normalization
# Create parser with mono normalization
parser
=
MultiModalDataParser
(
parser
=
MultiModalDataParser
(
...
@@ -438,7 +438,7 @@ class TestAudioPipelineE2E:
...
@@ -438,7 +438,7 @@ class TestAudioPipelineE2E:
)
)
# Process audio through the parser
# Process audio through the parser
result
=
parser
.
_parse_audio_data
((
mono_
librosa
,
16000
))
result
=
parser
.
_parse_audio_data
((
mono_
pyav
,
16000
))
audio_output
=
result
.
get
(
0
)
audio_output
=
result
.
get
(
0
)
# Verify output is still mono 1D
# Verify output is still mono 1D
...
@@ -446,7 +446,7 @@ class TestAudioPipelineE2E:
...
@@ -446,7 +446,7 @@ class TestAudioPipelineE2E:
assert
audio_output
.
shape
==
(
16000
,)
assert
audio_output
.
shape
==
(
16000
,)
# Verify audio content is preserved
# Verify audio content is preserved
np
.
testing
.
assert_array_almost_equal
(
audio_output
,
mono_
librosa
)
np
.
testing
.
assert_array_almost_equal
(
audio_output
,
mono_
pyav
)
def
test_multichannel_5_1_surround_to_mono_e2e
(
self
):
def
test_multichannel_5_1_surround_to_mono_e2e
(
self
):
"""Full pipeline: 5.1 surround (6 channels) → mono output."""
"""Full pipeline: 5.1 surround (6 channels) → mono output."""
...
...
tests/
entrypoints/openai
/test_embedding_shape_validation.py
→
tests/
multimodal
/test_embedding_shape_validation.py
View file @
0da93439
File moved
tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py
View file @
0da93439
...
@@ -3,10 +3,10 @@
...
@@ -3,10 +3,10 @@
from
collections.abc
import
Sequence
from
collections.abc
import
Sequence
from
vllm.config
import
VllmConfig
from
vllm.config
import
ModelConfig
,
PoolerConfig
,
VllmConfig
from
vllm.entrypoints.openai.engine.protocol
import
UsageInfo
from
vllm.entrypoints.openai.engine.protocol
import
UsageInfo
from
vllm.entrypoints.pooling.base.protocol
import
EmbedRequestMixin
from
vllm.inputs.data
import
PromptType
from
vllm.inputs.data
import
PromptType
from
vllm.logger
import
init_logger
from
vllm.outputs
import
PoolingRequestOutput
from
vllm.outputs
import
PoolingRequestOutput
from
vllm.plugins.io_processors.interface
import
(
from
vllm.plugins.io_processors.interface
import
(
IOProcessor
,
IOProcessor
,
...
@@ -16,14 +16,13 @@ from vllm.renderers import BaseRenderer
...
@@ -16,14 +16,13 @@ from vllm.renderers import BaseRenderer
from
vllm.tokenizers.detokenizer_utils
import
convert_ids_list_to_tokens
from
vllm.tokenizers.detokenizer_utils
import
convert_ids_list_to_tokens
from
.types
import
(
from
.types
import
(
EMBED_TASKS
,
SparseEmbeddingCompletionRequestMixin
,
SparseEmbeddingCompletionRequestMixin
,
SparseEmbeddingResponse
,
SparseEmbeddingResponse
,
SparseEmbeddingResponseData
,
SparseEmbeddingResponseData
,
SparseEmbeddingTokenWeight
,
SparseEmbeddingTokenWeight
,
)
)
logger
=
init_logger
(
__name__
)
class
BgeM3SparseEmbeddingsProcessor
(
class
BgeM3SparseEmbeddingsProcessor
(
IOProcessor
[
SparseEmbeddingCompletionRequestMixin
,
SparseEmbeddingResponse
]
IOProcessor
[
SparseEmbeddingCompletionRequestMixin
,
SparseEmbeddingResponse
]
...
@@ -33,6 +32,22 @@ class BgeM3SparseEmbeddingsProcessor(
...
@@ -33,6 +32,22 @@ class BgeM3SparseEmbeddingsProcessor(
self
.
offline_requests
:
list
[
SparseEmbeddingCompletionRequestMixin
]
=
[]
self
.
offline_requests
:
list
[
SparseEmbeddingCompletionRequestMixin
]
=
[]
self
.
online_requests
:
dict
[
str
,
SparseEmbeddingCompletionRequestMixin
]
=
{}
self
.
online_requests
:
dict
[
str
,
SparseEmbeddingCompletionRequestMixin
]
=
{}
self
.
renderer
:
BaseRenderer
=
renderer
self
.
renderer
:
BaseRenderer
=
renderer
self
.
default_pooling_params
=
{}
pooler_config
:
PoolerConfig
=
vllm_config
.
model_config
.
pooler_config
if
pooler_config
is
not
None
:
for
param
in
[
"use_activation"
,
"dimensions"
]:
if
getattr
(
pooler_config
,
param
,
None
)
is
None
:
continue
self
.
default_pooling_params
[
param
]
=
getattr
(
pooler_config
,
param
)
self
.
embed_dimensions
=
vllm_config
.
model_config
.
embedding_size
self
.
embed_request_queue
:
list
[
EmbedRequestMixin
]
=
[]
def
__repr__
(
self
)
->
str
:
return
(
f
"BgeM3SparseEmbeddingsProcessor("
f
"embed_dimensions=
{
self
.
embed_dimensions
}
, "
f
"default_pooling_params=
{
self
.
default_pooling_params
}
)"
)
def
merge_pooling_params
(
def
merge_pooling_params
(
self
,
self
,
...
@@ -41,7 +56,57 @@ class BgeM3SparseEmbeddingsProcessor(
...
@@ -41,7 +56,57 @@ class BgeM3SparseEmbeddingsProcessor(
if
params
is
None
:
if
params
is
None
:
params
=
PoolingParams
()
params
=
PoolingParams
()
# refer to PoolingCompletionRequest.to_pooling_params
# refer to PoolingCompletionRequest.to_pooling_params
params
.
task
=
"token_classify"
# set and verify pooling params
params
.
skip_reading_prefix_cache
=
True
raw_embed_request
=
self
.
embed_request_queue
.
pop
(
0
)
if
raw_embed_request
.
embed_task
not
in
EMBED_TASKS
:
raise
ValueError
(
f
"Unsupported task
{
raw_embed_request
}
, "
f
"Supported tasks are
{
EMBED_TASKS
}
"
)
has_dense_embed
=
True
if
raw_embed_request
.
embed_task
==
"dense"
:
params
.
task
=
"embed"
params
.
skip_reading_prefix_cache
=
False
elif
raw_embed_request
.
embed_task
==
"sparse"
:
params
.
task
=
"token_classify"
has_dense_embed
=
False
else
:
params
.
task
=
"embed&token_classify"
params
.
use_activation
=
raw_embed_request
.
use_activation
if
params
.
use_activation
is
None
:
params
.
use_activation
=
True
if
not
has_dense_embed
:
params
.
dimensions
=
None
return
params
params
.
dimensions
=
raw_embed_request
.
dimensions
model_config
:
ModelConfig
=
self
.
vllm_config
.
model_config
for
param
in
self
.
default_pooling_params
:
if
getattr
(
params
,
param
,
None
)
is
None
:
setattr
(
params
,
param
,
self
.
default_pooling_params
[
param
])
if
params
.
dimensions
is
not
None
:
if
not
model_config
.
is_matryoshka
:
raise
ValueError
(
f
'Model "
{
model_config
.
served_model_name
}
" does not '
f
"support matryoshka representation, "
f
"changing output dimensions will lead to poor results."
)
mds
=
model_config
.
matryoshka_dimensions
if
mds
is
not
None
:
if
params
.
dimensions
not
in
mds
:
raise
ValueError
(
f
"Model
{
model_config
.
served_model_name
!
r
}
"
f
"only supports
{
str
(
mds
)
}
matryoshka dimensions, "
f
"use other output dimensions will "
f
"lead to poor results."
)
elif
params
.
dimensions
<
1
:
raise
ValueError
(
"Dimensions must be greater than 0"
)
return
params
return
params
def
parse_request
(
def
parse_request
(
...
@@ -61,14 +126,16 @@ class BgeM3SparseEmbeddingsProcessor(
...
@@ -61,14 +126,16 @@ class BgeM3SparseEmbeddingsProcessor(
if
request_id
is
not
None
:
if
request_id
is
not
None
:
assert
request_id
not
in
self
.
online_requests
,
"request_id duplicated"
assert
request_id
not
in
self
.
online_requests
,
"request_id duplicated"
self
.
online_requests
[
request_id
]
=
prompt
self
.
online_requests
[
request_id
]
=
prompt
self
.
embed_request_queue
.
extend
(
prompt
.
to_embed_requests_online
())
else
:
else
:
self
.
offline_requests
.
append
(
prompt
)
self
.
offline_requests
.
append
(
prompt
)
self
.
embed_request_queue
.
extend
(
prompt
.
to_embed_requests_offline
())
return
prompt
.
input
return
prompt
.
input
def
_get_sparse_embedding_request
(
self
,
request_id
:
str
|
None
=
None
):
def
_get_sparse_embedding_request
(
self
,
request_id
:
str
|
None
=
None
):
if
request_id
:
if
request_id
:
return
self
.
online_requests
.
pop
(
request_id
,
None
)
return
self
.
online_requests
.
pop
(
request_id
,
None
)
return
self
.
offline_requests
.
pop
()
return
self
.
offline_requests
.
pop
(
0
)
def
_build_sparse_embedding_token_weights
(
def
_build_sparse_embedding_token_weights
(
self
,
self
,
...
@@ -100,26 +167,45 @@ class BgeM3SparseEmbeddingsProcessor(
...
@@ -100,26 +167,45 @@ class BgeM3SparseEmbeddingsProcessor(
)
->
SparseEmbeddingResponse
:
)
->
SparseEmbeddingResponse
:
num_prompt_tokens
=
0
num_prompt_tokens
=
0
response_data
=
[]
response_data
=
[]
return_tokens
=
self
.
_get_sparse_embedding_request
(
request_id
).
return_tokens
raw_request
=
self
.
_get_sparse_embedding_request
(
request_id
)
has_dense_embed
=
raw_request
.
embed_task
in
[
"dense"
,
"dense&sparse"
]
has_sparse_embed
=
raw_request
.
embed_task
in
[
"sparse"
,
"dense&sparse"
]
embed_dimensions
=
0
if
has_dense_embed
:
embed_dimensions
=
(
self
.
embed_dimensions
if
raw_request
.
dimensions
is
None
else
raw_request
.
dimensions
)
for
idx
in
range
(
len
(
model_output
)):
for
idx
in
range
(
len
(
model_output
)):
mo
=
model_output
[
idx
]
mo
=
model_output
[
idx
]
sparse_embedding
:
dict
[
int
,
float
]
=
{}
sparse_embedding
_dict
:
dict
[
int
,
float
]
=
{}
num_prompt_tokens
+=
len
(
mo
.
prompt_token_ids
)
num_prompt_tokens
+=
len
(
mo
.
prompt_token_ids
)
if
len
(
mo
.
prompt_token_ids
)
!=
len
(
mo
.
outputs
.
data
):
dense_embedding
:
list
[
float
]
|
None
=
None
# this is the case that add_special_tokens is True,
sparse_embedding
:
list
[
SparseEmbeddingTokenWeight
]
|
None
=
None
# which means first token and last token are special tokens
if
has_dense_embed
:
mo
.
prompt_token_ids
=
mo
.
prompt_token_ids
[
1
:]
dense_embedding
=
mo
.
outputs
.
data
[:
embed_dimensions
].
tolist
()
for
token_id
,
weight
in
zip
(
mo
.
prompt_token_ids
,
mo
.
outputs
.
data
.
tolist
()):
if
has_sparse_embed
:
sparse_embedding
[
token_id
]
=
max
(
sparse_weights
=
mo
.
outputs
.
data
[
embed_dimensions
:].
tolist
()
weight
,
sparse_embedding
.
get
(
token_id
,
0.0
)
if
len
(
mo
.
prompt_token_ids
)
!=
len
(
sparse_weights
):
# this is the case that add_special_tokens is True,
# which means first token and last token are special tokens
mo
.
prompt_token_ids
=
mo
.
prompt_token_ids
[
1
:]
for
token_id
,
weight
in
zip
(
mo
.
prompt_token_ids
,
sparse_weights
):
sparse_embedding_dict
[
token_id
]
=
max
(
weight
,
sparse_embedding_dict
.
get
(
token_id
,
0.0
)
)
sparse_embedding
=
self
.
_build_sparse_embedding_token_weights
(
sparse_embedding_dict
,
raw_request
.
return_tokens
,
)
)
response_data
.
append
(
response_data
.
append
(
SparseEmbeddingResponseData
(
SparseEmbeddingResponseData
(
index
=
idx
,
index
=
idx
,
sparse_embedding
=
self
.
_build_sparse_embedding_token_weights
(
object
=
raw_request
.
embed_task
,
sparse_embedding
,
sparse_embedding
=
sparse_embedding
,
return_tokens
,
dense_embedding
=
dense_embedding
,
),
)
)
)
)
...
...
tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Literal
,
get_args
from
pydantic
import
BaseModel
,
Field
from
pydantic
import
BaseModel
,
Field
from
vllm.entrypoints.openai.engine.protocol
import
UsageInfo
from
vllm.entrypoints.openai.engine.protocol
import
UsageInfo
from
vllm.entrypoints.pooling.base.protocol
import
CompletionRequestMixin
from
vllm.entrypoints.pooling.base.protocol
import
(
CompletionRequestMixin
,
EmbedRequestMixin
,
)
EmbedTask
=
Literal
[
"sparse"
,
"dense"
,
"dense&sparse"
,
]
EMBED_TASKS
:
tuple
[
EmbedTask
,
...]
=
get_args
(
EmbedTask
)
class
SparseEmbeddingCompletionRequestMixin
(
CompletionRequestMixin
):
class
SparseEmbeddingCompletionRequestMixin
(
CompletionRequestMixin
,
EmbedRequestMixin
):
return_tokens
:
bool
|
None
=
Field
(
return_tokens
:
bool
|
None
=
Field
(
default
=
None
,
default
=
None
,
description
=
"Whether to return dict shows the mapping of token_id to text."
description
=
"Whether to return dict shows the mapping of token_id to text."
"`None` or False means not return."
,
"`None` or False means not return."
,
)
)
embed_task
:
EmbedTask
=
Field
(
default
=
"dense&sparse"
,
description
=
"embed task, can be one of 'sparse', 'dense' , 'dense&sparse', "
"default to 'dense&sparse'"
,
)
def
to_embed_requests_offline
(
self
)
->
list
[
EmbedRequestMixin
]:
if
isinstance
(
self
.
input
,
list
):
return
[
self
]
*
len
(
self
.
input
)
return
[
self
]
def
to_embed_requests_online
(
self
)
->
list
[
EmbedRequestMixin
]:
return
[
self
]
class
SparseEmbeddingTokenWeight
(
BaseModel
):
class
SparseEmbeddingTokenWeight
(
BaseModel
):
...
@@ -23,8 +49,9 @@ class SparseEmbeddingTokenWeight(BaseModel):
...
@@ -23,8 +49,9 @@ class SparseEmbeddingTokenWeight(BaseModel):
class
SparseEmbeddingResponseData
(
BaseModel
):
class
SparseEmbeddingResponseData
(
BaseModel
):
index
:
int
index
:
int
object
:
str
=
"sparse-embedding"
object
:
str
=
"dense&sparse"
sparse_embedding
:
list
[
SparseEmbeddingTokenWeight
]
sparse_embedding
:
list
[
SparseEmbeddingTokenWeight
]
|
None
dense_embedding
:
list
[
float
]
|
None
class
SparseEmbeddingResponse
(
BaseModel
):
class
SparseEmbeddingResponse
(
BaseModel
):
...
...
tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
import
datetime
import
datetime
import
os
import
os
import
tempfile
import
tempfile
...
@@ -11,6 +10,7 @@ from typing import Any
...
@@ -11,6 +10,7 @@ from typing import Any
import
albumentations
import
albumentations
import
numpy
as
np
import
numpy
as
np
import
pybase64
as
base64
import
rasterio
import
rasterio
import
regex
as
re
import
regex
as
re
import
torch
import
torch
...
...
tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
View file @
0da93439
...
@@ -19,6 +19,12 @@ model_config = {
...
@@ -19,6 +19,12 @@ model_config = {
),
),
}
}
dense_embedding_sum
=
[
-
0.7214539647102356
,
# "What is the capital of France?"
-
0.6926871538162231
,
# "What is the capital of Germany?"
-
0.7129564881324768
,
# "What is the capital of Spain?"
]
def
_float_close
(
expected
:
object
,
result
:
object
):
def
_float_close
(
expected
:
object
,
result
:
object
):
assert
isinstance
(
expected
,
float
)
and
isinstance
(
result
,
float
),
(
assert
isinstance
(
expected
,
float
)
and
isinstance
(
result
,
float
),
(
...
@@ -33,6 +39,12 @@ def _get_attr_or_val(obj: object | dict, key: str):
...
@@ -33,6 +39,12 @@ def _get_attr_or_val(obj: object | dict, key: str):
return
getattr
(
obj
,
key
,
None
)
return
getattr
(
obj
,
key
,
None
)
def
_check_dense_embedding
(
data
,
index
=
0
):
assert
_float_close
(
sum
(
data
),
dense_embedding_sum
[
index
]),
(
"dense-embedding result not match"
)
def
_check_sparse_embedding
(
data
,
check_tokens
=
False
):
def
_check_sparse_embedding
(
data
,
check_tokens
=
False
):
expected_weights
=
[
expected_weights
=
[
{
"token_id"
:
32
,
"weight"
:
0.0552978515625
,
"token"
:
"?"
},
{
"token_id"
:
32
,
"weight"
:
0.0552978515625
,
"token"
:
"?"
},
...
@@ -109,7 +121,7 @@ async def test_bge_m3_sparse_plugin_online(
...
@@ -109,7 +121,7 @@ async def test_bge_m3_sparse_plugin_online(
assert
len
(
_get_attr_or_val
(
parsed_response
,
"data"
))
>
0
assert
len
(
_get_attr_or_val
(
parsed_response
,
"data"
))
>
0
data_entry
=
_get_attr_or_val
(
parsed_response
,
"data"
)[
0
]
data_entry
=
_get_attr_or_val
(
parsed_response
,
"data"
)[
0
]
assert
_get_attr_or_val
(
data_entry
,
"object"
)
==
"sparse
-embedding
"
assert
_get_attr_or_val
(
data_entry
,
"object"
)
==
"
dense&
sparse"
assert
_get_attr_or_val
(
data_entry
,
"sparse_embedding"
)
assert
_get_attr_or_val
(
data_entry
,
"sparse_embedding"
)
# Verify sparse embedding format
# Verify sparse embedding format
...
@@ -117,6 +129,11 @@ async def test_bge_m3_sparse_plugin_online(
...
@@ -117,6 +129,11 @@ async def test_bge_m3_sparse_plugin_online(
assert
isinstance
(
sparse_embedding
,
list
)
assert
isinstance
(
sparse_embedding
,
list
)
_check_sparse_embedding
(
sparse_embedding
,
return_tokens
)
_check_sparse_embedding
(
sparse_embedding
,
return_tokens
)
# Verify dense embedding format
dense_embedding
=
_get_attr_or_val
(
data_entry
,
"dense_embedding"
)
assert
isinstance
(
dense_embedding
,
list
)
_check_dense_embedding
(
dense_embedding
)
# Verify usage information
# Verify usage information
usage
=
_get_attr_or_val
(
parsed_response
,
"usage"
)
usage
=
_get_attr_or_val
(
parsed_response
,
"usage"
)
assert
usage
,
f
"usage not found for
{
parsed_response
}
"
assert
usage
,
f
"usage not found for
{
parsed_response
}
"
...
@@ -164,6 +181,9 @@ def test_bge_m3_sparse_plugin_offline(vllm_runner, return_tokens: bool):
...
@@ -164,6 +181,9 @@ def test_bge_m3_sparse_plugin_offline(vllm_runner, return_tokens: bool):
sparse_embedding
=
output
.
sparse_embedding
sparse_embedding
=
output
.
sparse_embedding
assert
isinstance
(
sparse_embedding
,
list
)
assert
isinstance
(
sparse_embedding
,
list
)
_check_sparse_embedding
(
sparse_embedding
,
return_tokens
)
_check_sparse_embedding
(
sparse_embedding
,
return_tokens
)
dense_embedding
=
output
.
dense_embedding
assert
isinstance
(
dense_embedding
,
list
)
_check_dense_embedding
(
dense_embedding
)
# Verify usage
# Verify usage
assert
response
.
usage
.
prompt_tokens
>
0
assert
response
.
usage
.
prompt_tokens
>
0
...
@@ -206,6 +226,9 @@ def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner):
...
@@ -206,6 +226,9 @@ def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner):
# Each output should have sparse embeddings
# Each output should have sparse embeddings
sparse_embedding
=
output
.
sparse_embedding
sparse_embedding
=
output
.
sparse_embedding
assert
isinstance
(
sparse_embedding
,
list
)
assert
isinstance
(
sparse_embedding
,
list
)
dense_embedding
=
output
.
dense_embedding
assert
isinstance
(
dense_embedding
,
list
)
_check_dense_embedding
(
dense_embedding
,
i
)
# Verify usage
# Verify usage
assert
response
.
usage
.
prompt_tokens
>
0
assert
response
.
usage
.
prompt_tokens
>
0
...
...
tests/plugins_tests/test_terratorch_io_processor_plugins.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
import
io
import
io
import
imagehash
import
imagehash
import
pybase64
as
base64
import
pytest
import
pytest
import
requests
import
requests
from
PIL
import
Image
from
PIL
import
Image
...
...
tests/quantization/test_mi3xx_moe.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
def
test_mi3xx_moe
():
print
(
"TODO: add tests for Mi3xx MoE quantization"
)
tests/reasoning/test_kimi_k2_reasoning_parser.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.entrypoints.openai.chat_completion.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.engine.protocol
import
DeltaMessage
from
vllm.reasoning.identity_reasoning_parser
import
IdentityReasoningParser
from
vllm.reasoning.kimi_k2_reasoning_parser
import
KimiK2ReasoningParser
from
vllm.tokenizers
import
get_tokenizer
REASONING_MODEL_NAME
=
"moonshotai/Kimi-K2.5"
@
pytest
.
fixture
(
scope
=
"module"
)
def
kimi_k2_tokenizer
():
return
get_tokenizer
(
tokenizer_name
=
REASONING_MODEL_NAME
,
trust_remote_code
=
True
)
def
test_parser_selection_thinking_enabled
(
kimi_k2_tokenizer
):
parser
=
KimiK2ReasoningParser
(
kimi_k2_tokenizer
,
chat_template_kwargs
=
{
"thinking"
:
True
}
)
assert
parser
.
_identity_parser
is
None
def
test_parser_selection_thinking_disabled
(
kimi_k2_tokenizer
):
parser
=
KimiK2ReasoningParser
(
kimi_k2_tokenizer
,
chat_template_kwargs
=
{
"thinking"
:
False
}
)
assert
isinstance
(
parser
.
_identity_parser
,
IdentityReasoningParser
)
def
test_extract_reasoning_with_think_tags
(
kimi_k2_tokenizer
):
parser
=
KimiK2ReasoningParser
(
kimi_k2_tokenizer
)
request
=
ChatCompletionRequest
(
model
=
"test-model"
,
messages
=
[],
temperature
=
1.0
)
reasoning
,
content
=
parser
.
extract_reasoning
(
"<think>step by step reasoning</think>final answer"
,
request
)
assert
reasoning
==
"step by step reasoning"
assert
content
==
"final answer"
def
test_extract_reasoning_empty_thinking
(
kimi_k2_tokenizer
):
parser
=
KimiK2ReasoningParser
(
kimi_k2_tokenizer
)
request
=
ChatCompletionRequest
(
model
=
"test-model"
,
messages
=
[],
temperature
=
1.0
)
reasoning
,
content
=
parser
.
extract_reasoning
(
"<think></think>final answer"
,
request
)
assert
reasoning
==
""
assert
content
==
"final answer"
def
test_extract_reasoning_implicit_start
(
kimi_k2_tokenizer
):
"""When there's no <think> tag, everything is treated as reasoning."""
parser
=
KimiK2ReasoningParser
(
kimi_k2_tokenizer
)
request
=
ChatCompletionRequest
(
model
=
"test-model"
,
messages
=
[],
temperature
=
1.0
)
reasoning
,
content
=
parser
.
extract_reasoning
(
"implicit reasoning with no tags"
,
request
)
assert
reasoning
==
"implicit reasoning with no tags"
assert
content
is
None
def
test_extract_reasoning_tool_section_ends_reasoning
(
kimi_k2_tokenizer
):
"""<|tool_calls_section_begin|> implicitly ends reasoning."""
parser
=
KimiK2ReasoningParser
(
kimi_k2_tokenizer
)
request
=
ChatCompletionRequest
(
model
=
"test-model"
,
messages
=
[],
temperature
=
1.0
)
text
=
"some reasoning<|tool_calls_section_begin|>tool call data"
reasoning
,
content
=
parser
.
extract_reasoning
(
text
,
request
)
assert
reasoning
==
"some reasoning"
assert
content
==
"<|tool_calls_section_begin|>tool call data"
def
test_streaming_reasoning_then_content
(
kimi_k2_tokenizer
):
"""Token-by-token streaming: reasoning tokens then content after </think>."""
parser
=
KimiK2ReasoningParser
(
kimi_k2_tokenizer
)
think_id
=
parser
.
_start_token_id
end_think_id
=
parser
.
_end_token_id
# Use a real token ID from the tokenizer for regular content
regular_id
=
kimi_k2_tokenizer
.
encode
(
"hello"
,
add_special_tokens
=
False
)[
0
]
# First token: <think> — single special token should be skipped
result
=
parser
.
extract_reasoning_streaming
(
previous_text
=
""
,
current_text
=
"<think>"
,
delta_text
=
"<think>"
,
previous_token_ids
=
[],
current_token_ids
=
[
think_id
],
delta_token_ids
=
[
think_id
],
)
assert
result
is
None
# Reasoning token
result
=
parser
.
extract_reasoning_streaming
(
previous_text
=
"<think>"
,
current_text
=
"<think>step one"
,
delta_text
=
"step one"
,
previous_token_ids
=
[
think_id
],
current_token_ids
=
[
think_id
,
regular_id
],
delta_token_ids
=
[
regular_id
],
)
assert
isinstance
(
result
,
DeltaMessage
)
assert
result
.
reasoning
==
"step one"
assert
result
.
content
is
None
# End token </think> as single token — should be skipped
result
=
parser
.
extract_reasoning_streaming
(
previous_text
=
"<think>step one"
,
current_text
=
"<think>step one</think>"
,
delta_text
=
"</think>"
,
previous_token_ids
=
[
think_id
,
regular_id
],
current_token_ids
=
[
think_id
,
regular_id
,
end_think_id
],
delta_token_ids
=
[
end_think_id
],
)
assert
result
is
None
# Content after </think>
content_id
=
kimi_k2_tokenizer
.
encode
(
"world"
,
add_special_tokens
=
False
)[
0
]
result
=
parser
.
extract_reasoning_streaming
(
previous_text
=
"<think>step one</think>"
,
current_text
=
"<think>step one</think>answer"
,
delta_text
=
"answer"
,
previous_token_ids
=
[
think_id
,
regular_id
,
end_think_id
],
current_token_ids
=
[
think_id
,
regular_id
,
end_think_id
,
content_id
],
delta_token_ids
=
[
content_id
],
)
assert
isinstance
(
result
,
DeltaMessage
)
assert
result
.
content
==
"answer"
def
test_streaming_tool_section_ends_reasoning
(
kimi_k2_tokenizer
):
"""<|tool_calls_section_begin|> in delta ends reasoning during streaming."""
parser
=
KimiK2ReasoningParser
(
kimi_k2_tokenizer
)
think_id
=
parser
.
_start_token_id
tool_begin_id
=
parser
.
_tool_section_start_token_id
regular_id
=
kimi_k2_tokenizer
.
encode
(
"hello"
,
add_special_tokens
=
False
)[
0
]
# Tool section token arrives — should transition from reasoning to content
result
=
parser
.
extract_reasoning_streaming
(
previous_text
=
"<think>thinking"
,
current_text
=
"<think>thinking<|tool_calls_section_begin|>"
,
delta_text
=
"<|tool_calls_section_begin|>"
,
previous_token_ids
=
[
think_id
,
regular_id
],
current_token_ids
=
[
think_id
,
regular_id
,
tool_begin_id
],
delta_token_ids
=
[
tool_begin_id
],
)
assert
isinstance
(
result
,
DeltaMessage
)
assert
result
.
content
==
"<|tool_calls_section_begin|>"
tests/reasoning/test_step3p5_reasoning_parser.py
View file @
0da93439
...
@@ -21,119 +21,119 @@ def step3p5_tokenizer():
...
@@ -21,119 +21,119 @@ def step3p5_tokenizer():
SIMPLE_REASONING
=
{
SIMPLE_REASONING
=
{
"output"
:
"This is a reasoning section</think>This is the rest"
,
"output"
:
"This is a reasoning section</think>This is the rest"
,
"reasoning
_content
"
:
"This is a reasoning section"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
# need to get into parser again to remove newline after </think>
# need to get into parser again to remove newline after </think>
COMPLETE_REASONING
=
{
COMPLETE_REASONING
=
{
"output"
:
"This is a reasoning section</think>"
,
"output"
:
"This is a reasoning section</think>"
,
"reasoning
_content
"
:
"This is a reasoning section"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
NO_CONTENT
=
{
NO_CONTENT
=
{
"output"
:
"This is content"
,
"output"
:
"This is content"
,
"reasoning
_content
"
:
"This is content"
,
"reasoning"
:
"This is content"
,
"content"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
NO_REASONING_STREAMING
=
{
NO_REASONING_STREAMING
=
{
"output"
:
"This is a reasoning section"
,
"output"
:
"This is a reasoning section"
,
"reasoning
_content
"
:
"This is a reasoning section"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
MULTIPLE_LINES
=
{
MULTIPLE_LINES
=
{
"output"
:
"This
\n
That</think>This is the rest
\n
That"
,
"output"
:
"This
\n
That</think>This is the rest
\n
That"
,
"reasoning
_content
"
:
"This
\n
That"
,
"reasoning"
:
"This
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
SHORTEST_REASONING_NO_STREAMING
=
{
SHORTEST_REASONING_NO_STREAMING
=
{
"output"
:
"</think>This is the rest"
,
"output"
:
"</think>This is the rest"
,
"reasoning
_content
"
:
None
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
SHORTEST_REASONING
=
{
SHORTEST_REASONING
=
{
"output"
:
"</think>This is the rest"
,
"output"
:
"</think>This is the rest"
,
"reasoning
_content
"
:
None
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
REASONING_WITH_THINK
=
{
REASONING_WITH_THINK
=
{
"output"
:
"<think>This is a reasoning section</think>This is the rest"
,
"output"
:
"<think>This is a reasoning section</think>This is the rest"
,
"reasoning
_content
"
:
"This is a reasoning section"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
COMPLETE_REASONING_WITH_THINK
=
{
COMPLETE_REASONING_WITH_THINK
=
{
"output"
:
"<think>This is a reasoning section</think>"
,
"output"
:
"<think>This is a reasoning section</think>"
,
"reasoning
_content
"
:
"This is a reasoning section"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
MULTIPLE_LINES_WITH_THINK
=
{
MULTIPLE_LINES_WITH_THINK
=
{
"output"
:
"<think>This
\n
That</think>This is the rest
\n
That"
,
"output"
:
"<think>This
\n
That</think>This is the rest
\n
That"
,
"reasoning
_content
"
:
"This
\n
That"
,
"reasoning"
:
"This
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
=
{
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
=
{
"output"
:
"</think>This is the rest"
,
"output"
:
"</think>This is the rest"
,
"reasoning
_content
"
:
None
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
SHORTEST_REASONING_WITH_THINK
=
{
SHORTEST_REASONING_WITH_THINK
=
{
"output"
:
"</think>This is the rest"
,
"output"
:
"</think>This is the rest"
,
"reasoning
_content
"
:
None
,
"reasoning"
:
None
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
THINK_NO_END
=
{
THINK_NO_END
=
{
"output"
:
"<think>This is a reasoning section"
,
"output"
:
"<think>This is a reasoning section"
,
"reasoning
_content
"
:
"This is a reasoning section"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
EMPTY
=
{
EMPTY
=
{
"output"
:
""
,
"output"
:
""
,
"reasoning
_content
"
:
None
,
"reasoning"
:
None
,
"content"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
EMPTY_STREAMING
=
{
EMPTY_STREAMING
=
{
"output"
:
""
,
"output"
:
""
,
"reasoning
_content
"
:
None
,
"reasoning"
:
None
,
"content"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
NEW_LINE
=
{
NEW_LINE
=
{
"output"
:
"
\n
<think>This is a reasoning section</think>
\n
This is the rest"
,
"output"
:
"
\n
<think>This is a reasoning section</think>
\n
This is the rest"
,
"reasoning
_content
"
:
"This is a reasoning section"
,
"reasoning"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
NEW_LINE_STREAMING
=
{
NEW_LINE_STREAMING
=
{
"output"
:
"
\n
<think>This is a reasoning section
\n
</think>
\n
This is the rest"
,
"output"
:
"
\n
<think>This is a reasoning section
\n
</think>
\n
This is the rest"
,
"reasoning
_content
"
:
"
\n
This is a reasoning section"
,
"reasoning"
:
"
\n
This is a reasoning section"
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
NEW_LINE_STREAMING_COMPLEX_CONTENT
=
{
NEW_LINE_STREAMING_COMPLEX_CONTENT
=
{
"output"
:
"
\n
This is a
\n
reasoning section
\n\n\n
</think>
\n\n
This is the rest"
,
"output"
:
"
\n
This is a
\n
reasoning section
\n\n\n
</think>
\n\n
This is the rest"
,
"reasoning
_content
"
:
"
\n
This is a
\n
reasoning section
\n\n
"
,
"reasoning"
:
"
\n
This is a
\n
reasoning section
\n\n
"
,
"content"
:
"
\n
This is the rest"
,
"content"
:
"
\n
This is the rest"
,
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
MULTI_TURN_PROMPT_CONTENT
=
{
MULTI_TURN_PROMPT_CONTENT
=
{
"output"
:
"<think> This is last turn's reasoning section </think> hello <think>"
,
"output"
:
"<think> This is last turn's reasoning section </think> hello <think>"
,
"reasoning
_content
"
:
""
,
"reasoning"
:
""
,
"content"
:
""
,
"content"
:
""
,
"is_reasoning_end"
:
False
,
"is_reasoning_end"
:
False
,
}
}
...
@@ -296,7 +296,7 @@ def test_reasoning(
...
@@ -296,7 +296,7 @@ def test_reasoning(
print
(
f
"content:
{
content
}
"
)
print
(
f
"content:
{
content
}
"
)
test_id
=
request
.
node
.
callspec
.
id
if
hasattr
(
request
.
node
,
"callspec"
)
else
None
test_id
=
request
.
node
.
callspec
.
id
if
hasattr
(
request
.
node
,
"callspec"
)
else
None
if
request
.
node
.
callspec
.
id
!=
"multi_turn_prompt_content"
:
if
request
.
node
.
callspec
.
id
!=
"multi_turn_prompt_content"
:
assert
reasoning
==
param_dict
[
"reasoning
_content
"
]
assert
reasoning
==
param_dict
[
"reasoning"
]
assert
content
==
param_dict
[
"content"
]
assert
content
==
param_dict
[
"content"
]
# Test is_reasoning_end
# Test is_reasoning_end
...
...
tests/renderers/test_sparse_tensor_validation.py
View file @
0da93439
...
@@ -5,9 +5,9 @@ Tests verify that malicious sparse tensors are rejected before they can trigger
...
@@ -5,9 +5,9 @@ Tests verify that malicious sparse tensors are rejected before they can trigger
out-of-bounds memory writes during to_dense() operations.
out-of-bounds memory writes during to_dense() operations.
"""
"""
import
base64
import
io
import
io
import
pybase64
as
base64
import
pytest
import
pytest
import
torch
import
torch
...
...
Prev
1
…
10
11
12
13
14
15
16
17
18
…
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment