Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fc67613a
"vscode:/vscode.git/clone" did not exist on "ccede2b264668d854cba4fce7f8fbbf203908f60"
Commit
fc67613a
authored
Apr 18, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.19.1' into v0.19.0
parents
31aec25b
b1388b1f
Changes
82
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1401 additions
and
26 deletions
+1401
-26
tests/models/multimodal/generation/test_phi4siglip.py
tests/models/multimodal/generation/test_phi4siglip.py
+198
-0
tests/models/multimodal/generation/test_voxtral.py
tests/models/multimodal/generation/test_voxtral.py
+4
-0
tests/models/multimodal/generation/vlm_utils/core.py
tests/models/multimodal/generation/vlm_utils/core.py
+5
-0
tests/models/multimodal/pooling/test_colqwen3.py
tests/models/multimodal/pooling/test_colqwen3.py
+5
-0
tests/models/multimodal/pooling/test_intern_vit.py
tests/models/multimodal/pooling/test_intern_vit.py
+5
-0
tests/models/multimodal/pooling/test_jinavl_reranker.py
tests/models/multimodal/pooling/test_jinavl_reranker.py
+5
-0
tests/models/multimodal/processing/test_musicflamingo.py
tests/models/multimodal/processing/test_musicflamingo.py
+7
-0
tests/models/registry.py
tests/models/registry.py
+135
-8
tests/models/utils.py
tests/models/utils.py
+11
-1
tests/reasoning/test_gemma4_reasoning_parser.py
tests/reasoning/test_gemma4_reasoning_parser.py
+87
-8
tests/reasoning/test_step3p5_reasoning_parser.py
tests/reasoning/test_step3p5_reasoning_parser.py
+2
-2
tests/renderers/test_gemma4_chat_template.py
tests/renderers/test_gemma4_chat_template.py
+345
-0
tests/tool_parsers/test_gemma4_tool_parser.py
tests/tool_parsers/test_gemma4_tool_parser.py
+182
-0
tests/v1/e2e/spec_decode/test_spec_decode.py
tests/v1/e2e/spec_decode/test_spec_decode.py
+5
-1
vllm/_custom_ops.py
vllm/_custom_ops.py
+35
-0
vllm/compilation/decorators.py
vllm/compilation/decorators.py
+17
-2
vllm/compilation/passes/fusion/minimax_qk_norm_fusion.py
vllm/compilation/passes/fusion/minimax_qk_norm_fusion.py
+340
-0
vllm/compilation/passes/pass_manager.py
vllm/compilation/passes/pass_manager.py
+4
-0
vllm/config/compilation.py
vllm/config/compilation.py
+7
-4
vllm/config/speculative.py
vllm/config/speculative.py
+2
-0
No files found.
tests/models/multimodal/generation/test_phi4siglip.py
0 → 100644
View file @
fc67613a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Sequence
from
importlib.metadata
import
version
import
pytest
import
regex
as
re
from
packaging.version
import
Version
from
transformers
import
AutoModelForCausalLM
,
AutoTokenizer
from
vllm.logprobs
import
SampleLogprobs
from
vllm.multimodal.image
import
rescale_image_size
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
)
from
....utils
import
multi_gpu_test
from
...utils
import
check_logprobs_close
# Module-wide marker: every test in this file is skipped when the installed
# transformers version is 5.0 or newer.
# NOTE(review): the condition triggers from v5.0, while the reason text refers
# to transformers "above v5.4" -- confirm the intended threshold.
pytestmark = pytest.mark.skipif(
    Version("5.0") <= Version(version("transformers")),
    reason=(
        "vllm upgraded transformers above v5.4 where HF model custom code uses siglip2 "
        "internals (filter_out_non_signature_kwargs) removed by "
        "huggingface/transformers#43514"
    ),
)

# Model identifier the tests below are parametrized with.
MODEL_ID = "microsoft/Phi-4-reasoning-vision-15B"

# Single-image prompts keyed by image asset name; each prompt contains one
# "<image>" placeholder.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        "stop_sign": "<|user|>\n<image>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
        "cherry_blossom": "<|user|>\n<image>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
    }
)

# Multi-image prompt containing two "<image>" placeholders.
HF_MULTIIMAGE_IMAGE_PROMPT = (
    "<|user|>\n<image>\n<image>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
)

# dtype string passed to both the vLLM and HF runners.
DTYPE = "half"
# Generation length passed to both runners' greedy-logprobs calls.
MAX_TOKENS = 128
# Number of logprobs requested per generated token.
NUM_LOGPROBS = 10
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
):
    """Convert a vLLM output triple into the shape of an HF output triple.

    Args:
        vllm_output: ``(token_ids, text, logprobs)`` as produced by vLLM. The
            token ids are discarded and recomputed from the cleaned text.
        model: Model name or path used to load the tokenizer.

    Returns:
        ``(hf_token_ids, hf_text, logprobs)`` where ``hf_text`` is the cleaned
        text followed by ``"<|end|><|endoftext|>"`` and ``hf_token_ids`` is the
        tokenizer encoding of the cleaned text without a leading BOS token.
    """
    _unused_ids, generated_text, logprobs = vllm_output

    # Strip every run of "<image>" placeholders, then at most one leading space.
    cleaned_text = re.sub(r"(<image>)+", "", generated_text).removeprefix(" ")

    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
    token_ids = tokenizer.encode(cleaned_text)
    starts_with_bos = bool(token_ids) and token_ids[0] == tokenizer.bos_token_id
    if starts_with_bos:
        token_ids = token_ids[1:]

    return token_ids, cleaned_text + "<|end|><|endoftext|>", logprobs
def _build_single_image_inputs(
    image_assets,
) -> list[tuple[list[str], PromptImageInput]]:
    """Assemble the single-image test cases for every size-factor group.

    For each size-factor group and each (image, prompt) pair, one case is
    produced: the prompt repeated once per factor, paired with the image
    rescaled by each factor in turn.
    """
    pil_images = [asset.pil_image for asset in image_assets]
    factor_groups = ([1.0], [0.25, 0.5, 1.0])

    return [
        (
            [prompt] * len(factors),
            [rescale_image_size(pil, factor) for factor in factors],
        )
        for factors in factor_groups
        for pil, prompt in zip(pil_images, HF_IMAGE_PROMPTS)
    ]
def _build_multi_image_inputs(
    image_assets,
) -> list[tuple[list[str], PromptImageInput]]:
    """Assemble the multi-image test cases for every size-factor group.

    Each case pairs the multi-image prompt (repeated once per factor) with,
    for every factor, the full list of images rescaled by that factor.
    """
    pil_images = [asset.pil_image for asset in image_assets]
    factor_groups = ([0.5], [0.15, 0.30])

    return [
        (
            [HF_MULTIIMAGE_IMAGE_PROMPT] * len(factors),
            [
                [rescale_image_size(pil, factor) for pil in pil_images]
                for factor in factors
            ],
        )
        for factors in factor_groups
    ]
def _run_and_compare(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    all_inputs: Sequence[tuple[list[str], PromptImageInput]],
    model: str,
    max_model_len: int,
    max_num_seqs: int,
    mm_limit: int,
    gpu_memory_utilization: float,
):
    """Generate with vLLM and HF once each, then compare logprobs per case.

    Args:
        hf_runner: Context-manager factory for the HF reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        all_inputs: ``(prompts, images)`` pairs; each pair is one test case.
        model: Model name or path given to both runners.
        max_model_len: Forwarded to the vLLM runner.
        max_num_seqs: Forwarded to the vLLM runner.
        mm_limit: Per-prompt image limit forwarded to the vLLM runner.
        gpu_memory_utilization: Forwarded to the vLLM runner.
    """
    # NOTE: run vLLM first, then HF. vLLM needs a fresh process without
    # cuda initialization; running HF first would break the multiprocessing
    # backend with fork method.
    engine_kwargs = dict(
        runner="generate",
        max_model_len=max_model_len,
        max_num_seqs=max_num_seqs,
        gpu_memory_utilization=gpu_memory_utilization,
        dtype=DTYPE,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=2,
        trust_remote_code=True,
        enforce_eager=True,
    )
    vllm_outputs_per_case = []
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        for prompts, images in all_inputs:
            vllm_outputs_per_case.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    MAX_TOKENS,
                    num_logprobs=NUM_LOGPROBS,
                    images=images,
                )
            )

    hf_outputs_per_case = []
    with hf_runner(
        model,
        dtype=DTYPE,
        model_kwargs={"_attn_implementation": "sdpa", "device_map": "auto"},
        auto_cls=AutoModelForCausalLM,
        trust_remote_code=True,
    ) as hf_model:
        for prompts, images in all_inputs:
            hf_outputs_per_case.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    MAX_TOKENS,
                    num_logprobs=NUM_LOGPROBS,
                    images=images,
                )
            )

    # Compare case by case, with HF as side 0 and vLLM as side 1.
    for hf_case, vllm_case in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", [MODEL_ID])
def test_models(hf_runner, vllm_runner, image_assets, model) -> None:
    """Compare vLLM and HF logprobs on the single-image inputs."""
    _run_and_compare(
        hf_runner,
        vllm_runner,
        _build_single_image_inputs(image_assets),
        model,
        max_model_len=8192,
        max_num_seqs=2,
        mm_limit=1,  # one image per prompt
        gpu_memory_utilization=0.80,
    )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", [MODEL_ID])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model) -> None:
    """Compare vLLM and HF logprobs on the multi-image inputs."""
    _run_and_compare(
        hf_runner,
        vllm_runner,
        _build_multi_image_inputs(image_assets),
        model,
        max_model_len=8192,
        max_num_seqs=2,
        mm_limit=2,  # the multi-image prompt carries two <image> placeholders
        gpu_memory_utilization=0.80,
    )
tests/models/multimodal/generation/test_voxtral.py
View file @
fc67613a
...
@@ -149,6 +149,10 @@ def test_online_serving(vllm_runner, audio_assets: AudioTestAssets):
...
@@ -149,6 +149,10 @@ def test_online_serving(vllm_runner, audio_assets: AudioTestAssets):
)
)
@
pytest
.
mark
.
skip
(
reason
=
"VoxtralProcessor.apply_chat_template() in transformers v5 "
"doesn't resolve chat_template=None to the default template"
)
def
test_hf_reference
(
hf_runner
,
vllm_runner
,
audio_assets
:
AudioTestAssets
):
def
test_hf_reference
(
hf_runner
,
vllm_runner
,
audio_assets
:
AudioTestAssets
):
"""Compare vLLM Mistral-format output against HF Transformers reference.
"""Compare vLLM Mistral-format output against HF Transformers reference.
...
...
tests/models/multimodal/generation/vlm_utils/core.py
View file @
fc67613a
...
@@ -80,6 +80,11 @@ def run_test(
...
@@ -80,6 +80,11 @@ def run_test(
if
vllm_runner_kwargs
:
if
vllm_runner_kwargs
:
vllm_runner_kwargs_
.
update
(
vllm_runner_kwargs
)
vllm_runner_kwargs_
.
update
(
vllm_runner_kwargs
)
# Avoid passing limit_mm_per_prompt twice when vllm_runner_kwargs
# already contains it (e.g. gemma4 sets it via vllm_runner_kwargs).
if
"limit_mm_per_prompt"
in
vllm_runner_kwargs_
:
limit_mm_per_prompt
=
vllm_runner_kwargs_
.
pop
(
"limit_mm_per_prompt"
)
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
max_model_len
=
max_model_len
,
max_model_len
=
max_model_len
,
...
...
tests/models/multimodal/pooling/test_colqwen3.py
View file @
fc67613a
...
@@ -22,6 +22,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
...
@@ -22,6 +22,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from
....conftest
import
VllmRunner
from
....conftest
import
VllmRunner
pytestmark
=
pytest
.
mark
.
skip
(
reason
=
"ColQwen3 model's weight tying is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
MODELS
=
[
MODELS
=
[
"TomoroAI/tomoro-colqwen3-embed-4b"
,
"TomoroAI/tomoro-colqwen3-embed-4b"
,
"OpenSearch-AI/Ops-Colqwen3-4B"
,
"OpenSearch-AI/Ops-Colqwen3-4B"
,
...
...
tests/models/multimodal/pooling/test_intern_vit.py
View file @
fc67613a
...
@@ -11,6 +11,11 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
...
@@ -11,6 +11,11 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from
....conftest
import
ImageTestAssets
from
....conftest
import
ImageTestAssets
pytestmark
=
pytest
.
mark
.
skip
(
reason
=
"InternVisionModel's custom code is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
# we use snapshot_download to prevent conflicts between
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
# dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN
=
[
"*.json"
,
"*.py"
,
"*.safetensors"
,
"*.txt"
,
"*.model"
]
DOWNLOAD_PATTERN
=
[
"*.json"
,
"*.py"
,
"*.safetensors"
,
"*.txt"
,
"*.model"
]
...
...
tests/models/multimodal/pooling/test_jinavl_reranker.py
View file @
fc67613a
...
@@ -15,6 +15,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
...
@@ -15,6 +15,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from
....conftest
import
HfRunner
,
VllmRunner
from
....conftest
import
HfRunner
,
VllmRunner
pytestmark
=
pytest
.
mark
.
skip
(
reason
=
"jinaai/jina-reranker-m0 custom code is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
MODELS
=
[
"jinaai/jina-reranker-m0"
]
MODELS
=
[
"jinaai/jina-reranker-m0"
]
MM_PROCESSOR_KWARGS
=
{
MM_PROCESSOR_KWARGS
=
{
...
...
tests/models/multimodal/processing/test_musicflamingo.py
View file @
fc67613a
...
@@ -17,11 +17,13 @@
...
@@ -17,11 +17,13 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
from
importlib.metadata
import
version
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
numpy
as
np
import
numpy
as
np
import
pytest
import
pytest
import
torch
import
torch
from
packaging.version
import
Version
from
transformers
import
PretrainedConfig
from
transformers
import
PretrainedConfig
from
tests.models.registry
import
HF_EXAMPLE_MODELS
from
tests.models.registry
import
HF_EXAMPLE_MODELS
...
@@ -122,6 +124,11 @@ def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
...
@@ -122,6 +124,11 @@ def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
assert
builder
.
get_dummy_text
({
"audio"
:
2
})
==
"<sound><sound>"
assert
builder
.
get_dummy_text
({
"audio"
:
2
})
==
"<sound><sound>"
@
pytest
.
mark
.
skipif
(
Version
(
version
(
"transformers"
))
>=
Version
(
"5.5"
),
reason
=
"transformers v5.5 added native MusicFlamingoForConditionalGeneration "
"with a different get_audio_features signature (requires input_ids)"
,
)
def
test_musicflamingo_audio_feature_pipeline_matches_hf_small_config
():
def
test_musicflamingo_audio_feature_pipeline_matches_hf_small_config
():
from
transformers.models.musicflamingo
import
(
from
transformers.models.musicflamingo
import
(
modeling_musicflamingo
as
hf_musicflamingo_modeling
,
modeling_musicflamingo
as
hf_musicflamingo_modeling
,
...
...
tests/models/registry.py
View file @
fc67613a
...
@@ -334,7 +334,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -334,7 +334,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"internlm/internlm2-chat-7b"
,
trust_remote_code
=
True
"internlm/internlm2-chat-7b"
,
trust_remote_code
=
True
),
),
"InternLM2VEForCausalLM"
:
_HfExamplesInfo
(
"InternLM2VEForCausalLM"
:
_HfExamplesInfo
(
"OpenGVLab/Mono-InternVL-2B"
,
trust_remote_code
=
True
"OpenGVLab/Mono-InternVL-2B"
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.57"
,
transformers_version_reason
=
{
"vllm"
:
(
"Custom config cannot be loaded with Transformers "
"v5 because `vision_config` is not always set"
)
},
),
),
"InternLM3ForCausalLM"
:
_HfExamplesInfo
(
"InternLM3ForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm3-8b-instruct"
,
trust_remote_code
=
True
"internlm/internlm3-8b-instruct"
,
trust_remote_code
=
True
...
@@ -469,6 +477,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -469,6 +477,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Plamo2ForCausalLM"
:
_HfExamplesInfo
(
"Plamo2ForCausalLM"
:
_HfExamplesInfo
(
"pfnet/plamo-2-1b"
,
"pfnet/plamo-2-1b"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.57"
,
transformers_version_reason
=
{
"hf"
:
(
"Custom model code uses `_tied_weight_keys: list[str]` but "
"Transformers v5 now expects `_tied_weight_keys: dict[str, str]`"
)
},
),
),
"Plamo3ForCausalLM"
:
_HfExamplesInfo
(
"Plamo3ForCausalLM"
:
_HfExamplesInfo
(
"pfnet/plamo-3-nict-2b-base"
,
"pfnet/plamo-3-nict-2b-base"
,
...
@@ -509,6 +524,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -509,6 +524,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_model_len
=
4096
,
is_available_online
=
True
,
is_available_online
=
True
,
max_transformers_version
=
"5.3"
,
transformers_version_reason
=
{
"vllm"
:
(
"vllm upgraded transformers above v5.4 where "
"validate_rope() no longer accepts ignore_keys param"
)
},
),
),
"SeedOssForCausalLM"
:
_HfExamplesInfo
(
"SeedOssForCausalLM"
:
_HfExamplesInfo
(
"ByteDance-Seed/Seed-OSS-36B-Instruct"
,
"ByteDance-Seed/Seed-OSS-36B-Instruct"
,
...
@@ -544,6 +566,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -544,6 +566,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"xverse/XVERSE-7B-Chat"
,
"xverse/XVERSE-7B-Chat"
,
tokenizer
=
"meta-llama/Llama-2-7b"
,
tokenizer
=
"meta-llama/Llama-2-7b"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.57"
,
transformers_version_reason
=
{
"vllm"
:
"XVERSE tokenizer is incompatible with transformers v5 "
"(add_prefix_space / prepend_scheme mismatch)."
,
},
),
),
"Zamba2ForCausalLM"
:
_HfExamplesInfo
(
"Zyphra/Zamba2-7B-instruct"
),
"Zamba2ForCausalLM"
:
_HfExamplesInfo
(
"Zyphra/Zamba2-7B-instruct"
),
"MiMoForCausalLM"
:
_HfExamplesInfo
(
"XiaomiMiMo/MiMo-7B-RL"
,
trust_remote_code
=
True
),
"MiMoForCausalLM"
:
_HfExamplesInfo
(
"XiaomiMiMo/MiMo-7B-RL"
,
trust_remote_code
=
True
),
...
@@ -754,10 +781,18 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -754,10 +781,18 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# [Decoder-only]
# [Decoder-only]
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AudioFlamingo3ForConditionalGeneration"
:
_HfExamplesInfo
(
"AudioFlamingo3ForConditionalGeneration"
:
_HfExamplesInfo
(
"nvidia/audio-flamingo-3-hf"
,
min_transformers_version
=
"5.0.0"
"nvidia/audio-flamingo-3-hf"
,
min_transformers_version
=
"5.3.0"
,
transformers_version_reason
=
{
"vllm"
:
"Needs https://github.com/huggingface/transformers/pull/43538"
},
),
),
"MusicFlamingoForConditionalGeneration"
:
_HfExamplesInfo
(
"MusicFlamingoForConditionalGeneration"
:
_HfExamplesInfo
(
"nvidia/music-flamingo-2601-hf"
,
min_transformers_version
=
"5.3.0"
"nvidia/music-flamingo-2601-hf"
,
min_transformers_version
=
"5.3.0"
,
transformers_version_reason
=
{
"vllm"
:
"Needs https://github.com/huggingface/transformers/pull/43538"
},
),
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereLabs/aya-vision-8b"
),
"AyaVisionForConditionalGeneration"
:
_HfExamplesInfo
(
"CohereLabs/aya-vision-8b"
),
"BagelForConditionalGeneration"
:
_HfExamplesInfo
(
"ByteDance-Seed/BAGEL-7B-MoT"
),
"BagelForConditionalGeneration"
:
_HfExamplesInfo
(
"ByteDance-Seed/BAGEL-7B-MoT"
),
...
@@ -800,9 +835,30 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -800,9 +835,30 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
),
"FireRedASR2ForConditionalGeneration"
:
_HfExamplesInfo
(
"FireRedASR2ForConditionalGeneration"
:
_HfExamplesInfo
(
"allendou/FireRedASR2-LLM-vllm"
,
"allendou/FireRedASR2-LLM-vllm"
,
trust_remote_code
=
True
,
max_transformers_version
=
"5.1"
,
transformers_version_reason
=
{
"vllm"
:
"Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__')."
,
},
),
"FireRedLIDForConditionalGeneration"
:
_HfExamplesInfo
(
"PatchyTisa/FireRedLID-vllm"
,
trust_remote_code
=
True
,
max_transformers_version
=
"5.1"
,
transformers_version_reason
=
{
"vllm"
:
"Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__')."
,
},
),
),
"FunASRForConditionalGeneration"
:
_HfExamplesInfo
(
"FunASRForConditionalGeneration"
:
_HfExamplesInfo
(
"allendou/Fun-ASR-Nano-2512-vllm"
,
"allendou/Fun-ASR-Nano-2512-vllm"
,
trust_remote_code
=
True
,
max_transformers_version
=
"5.1"
,
transformers_version_reason
=
{
"vllm"
:
"Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__')."
,
},
),
),
"FunAudioChatForConditionalGeneration"
:
_HfExamplesInfo
(
"FunAudioChatForConditionalGeneration"
:
_HfExamplesInfo
(
"funaudiochat"
,
is_available_online
=
False
"funaudiochat"
,
is_available_online
=
False
...
@@ -844,6 +900,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -844,6 +900,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"HCXVisionForCausalLM"
:
_HfExamplesInfo
(
"HCXVisionForCausalLM"
:
_HfExamplesInfo
(
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
,
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.57"
,
transformers_version_reason
=
{
"vllm"
:
(
"Custom config cannot be loaded with Transformers "
"v5 because `text_config` is not always set"
)
},
),
),
"HCXVisionV2ForCausalLM"
:
_HfExamplesInfo
(
"HCXVisionV2ForCausalLM"
:
_HfExamplesInfo
(
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B"
,
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B"
,
...
@@ -863,7 +926,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -863,7 +926,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras
=
{
"0.2-2B-Preview"
:
"PerceptronAI/Isaac-0.2-2B-Preview"
},
extras
=
{
"0.2-2B-Preview"
:
"PerceptronAI/Isaac-0.2-2B-Preview"
},
),
),
"InternS1ForConditionalGeneration"
:
_HfExamplesInfo
(
"InternS1ForConditionalGeneration"
:
_HfExamplesInfo
(
"internlm/Intern-S1"
,
trust_remote_code
=
True
"internlm/Intern-S1"
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.57"
,
transformers_version_reason
=
{
"vllm"
:
"Custom tokenizer code is not compatible with Transformers v5."
},
),
),
"InternS1ProForConditionalGeneration"
:
_HfExamplesInfo
(
"InternS1ProForConditionalGeneration"
:
_HfExamplesInfo
(
"internlm/Intern-S1-Pro"
,
"internlm/Intern-S1-Pro"
,
...
@@ -952,7 +1020,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -952,7 +1020,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MiDashengLMModel"
:
_HfExamplesInfo
(
"MiDashengLMModel"
:
_HfExamplesInfo
(
"mispeech/midashenglm-7b"
,
trust_remote_code
=
True
"mispeech/midashenglm-7b"
,
trust_remote_code
=
True
),
),
"MiniCPMO"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-o-2_6"
,
trust_remote_code
=
True
),
"MiniCPMO"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-o-2_6"
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.57"
,
transformers_version_reason
=
{
"hf"
:
"Custom processor code is not compatible with Transformers v5."
},
),
"MiniCPMV"
:
_HfExamplesInfo
(
"MiniCPMV"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-Llama3-V-2_5"
,
"openbmb/MiniCPM-Llama3-V-2_5"
,
extras
=
{
extras
=
{
...
@@ -960,6 +1035,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -960,6 +1035,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"4.0"
:
"openbmb/MiniCPM-V-4"
,
"4.0"
:
"openbmb/MiniCPM-V-4"
,
"4.5"
:
"openbmb/MiniCPM-V-4_5"
,
"4.5"
:
"openbmb/MiniCPM-V-4_5"
,
},
},
max_transformers_version
=
"4.57"
,
transformers_version_reason
=
{
"vllm"
:
(
"MiniCPMVBatchFeature is incompatible with its base class in "
"Transformers v5. See https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/discussions/78"
)
},
trust_remote_code
=
True
,
trust_remote_code
=
True
,
),
),
"MiniMaxVL01ForConditionalGeneration"
:
_HfExamplesInfo
(
"MiniMaxVL01ForConditionalGeneration"
:
_HfExamplesInfo
(
...
@@ -996,13 +1078,25 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -996,13 +1078,25 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"nano_vl_dummy"
,
is_available_online
=
False
,
trust_remote_code
=
True
"nano_vl_dummy"
,
is_available_online
=
False
,
trust_remote_code
=
True
),
),
"OpenCUAForConditionalGeneration"
:
_HfExamplesInfo
(
"OpenCUAForConditionalGeneration"
:
_HfExamplesInfo
(
"xlangai/OpenCUA-7B"
,
trust_remote_code
=
True
"xlangai/OpenCUA-7B"
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.57"
,
transformers_version_reason
=
{
"vllm"
:
"Tokenizer cannot be initialised in Transformers v5."
},
),
),
"OpenPanguVLForConditionalGeneration"
:
_HfExamplesInfo
(
"OpenPanguVLForConditionalGeneration"
:
_HfExamplesInfo
(
"FreedomIntelligence/openPangu-VL-7B"
,
"FreedomIntelligence/openPangu-VL-7B"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_model_len
=
4096
,
enforce_eager
=
True
,
enforce_eager
=
True
,
max_transformers_version
=
"4.57"
,
transformers_version_reason
=
{
"vllm"
:
(
"OpenPanguVLVideoProcessorInitKwargs does not specify total=False, "
"making all kwargs required. See https://huggingface.co/FreedomIntelligence/openPangu-VL-7B/discussions/2"
)
},
),
),
"Ovis"
:
_HfExamplesInfo
(
"Ovis"
:
_HfExamplesInfo
(
"AIDC-AI/Ovis2-1B"
,
"AIDC-AI/Ovis2-1B"
,
...
@@ -1014,12 +1108,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -1014,12 +1108,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"1.6-gemma"
:
"AIDC-AI/Ovis1.6-Gemma2-9B"
,
"1.6-gemma"
:
"AIDC-AI/Ovis1.6-Gemma2-9B"
,
},
},
),
),
"Ovis2_5"
:
_HfExamplesInfo
(
"AIDC-AI/Ovis2.5-2B"
,
trust_remote_code
=
True
),
"Ovis2_5"
:
_HfExamplesInfo
(
"AIDC-AI/Ovis2.5-2B"
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.57"
,
transformers_version_reason
=
{
"vllm"
:
"Custom processor code is not compatible with Transformers v5."
},
),
"Ovis2_6ForCausalLM"
:
_HfExamplesInfo
(
"Ovis2_6ForCausalLM"
:
_HfExamplesInfo
(
"AIDC-AI/Ovis2.6-2B"
,
is_available_online
=
False
,
trust_remote_code
=
True
"AIDC-AI/Ovis2.6-2B"
,
is_available_online
=
False
,
trust_remote_code
=
True
),
),
"Ovis2_6_MoeForCausalLM"
:
_HfExamplesInfo
(
"Ovis2_6_MoeForCausalLM"
:
_HfExamplesInfo
(
"AIDC-AI/Ovis2.6-30B-A3B"
,
trust_remote_code
=
True
"AIDC-AI/Ovis2.6-30B-A3B"
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.57"
,
transformers_version_reason
=
{
"vllm"
:
"Custom processor code is not compatible with Transformers v5."
},
),
),
"PaddleOCRVLForConditionalGeneration"
:
_HfExamplesInfo
(
"PaddleOCRVLForConditionalGeneration"
:
_HfExamplesInfo
(
"PaddlePaddle/PaddleOCR-VL"
,
"PaddlePaddle/PaddleOCR-VL"
,
...
@@ -1038,6 +1144,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -1038,6 +1144,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
},
# noqa: E501
},
# noqa: E501
extras
=
{
"phi3.5"
:
"microsoft/Phi-3.5-vision-instruct"
},
extras
=
{
"phi3.5"
:
"microsoft/Phi-3.5-vision-instruct"
},
),
),
"Phi4ForCausalLMV"
:
_HfExamplesInfo
(
"microsoft/Phi-4-reasoning-vision-15B"
,
trust_remote_code
=
True
,
max_transformers_version
=
"5.3"
,
transformers_version_reason
=
{
"vllm"
:
(
"vllm upgraded transformers above v5.4 where HF model "
"custom code uses siglip2 internals "
"(filter_out_non_signature_kwargs) removed "
"by huggingface/transformers#43514"
)
},
),
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-4-multimodal-instruct"
,
trust_remote_code
=
True
"microsoft/Phi-4-multimodal-instruct"
,
trust_remote_code
=
True
),
),
...
@@ -1133,6 +1252,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -1133,6 +1252,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"architectures"
:
[
"Tarsier2ForConditionalGeneration"
],
"architectures"
:
[
"Tarsier2ForConditionalGeneration"
],
"model_type"
:
"tarsier2"
,
"model_type"
:
"tarsier2"
,
},
},
max_transformers_version
=
"5.3"
,
transformers_version_reason
=
{
"vllm"
:
(
"Qwen2VLConfig was split into Qwen2VLConfig + "
"Qwen2VLTextConfig in transformers v5, breaking "
"attribute access (num_attention_heads, hidden_size, etc.)"
)
},
),
),
"VoxtralForConditionalGeneration"
:
_HfExamplesInfo
(
"VoxtralForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Voxtral-Mini-3B-2507"
,
"mistralai/Voxtral-Mini-3B-2507"
,
...
...
tests/models/utils.py
View file @
fc67613a
...
@@ -375,6 +375,7 @@ def softmax(data):
...
@@ -375,6 +375,7 @@ def softmax(data):
@
dataclass
@
dataclass
class
ModelInfo
:
class
ModelInfo
:
name
:
str
name
:
str
revision
:
str
|
None
=
None
architecture
:
str
=
""
architecture
:
str
=
""
dtype
:
str
=
"auto"
dtype
:
str
=
"auto"
max_model_len
:
int
|
None
=
None
max_model_len
:
int
|
None
=
None
...
@@ -468,7 +469,16 @@ def dummy_hf_overrides(
...
@@ -468,7 +469,16 @@ def dummy_hf_overrides(
else
:
else
:
# Use minimal layers for testing
# Use minimal layers for testing
num_layers
=
1
num_layers
=
1
num_hidden_layers
=
3
if
model_arch
==
"Gemma3nForConditionalGeneration"
else
1
num_hidden_layers
=
(
3
if
model_arch
in
(
"Gemma3nForConditionalGeneration"
,
"Gemma4ForCausalLM"
,
"Gemma4ForConditionalGeneration"
,
)
else
1
)
update_dict
=
{
update_dict
=
{
"num_layers"
:
num_layers
,
"num_layers"
:
num_layers
,
...
...
tests/reasoning/test_gemma4_reasoning_parser.py
View file @
fc67613a
...
@@ -4,6 +4,9 @@
...
@@ -4,6 +4,9 @@
import
pytest
import
pytest
from
tests.reasoning.utils
import
run_reasoning_extraction
from
tests.reasoning.utils
import
run_reasoning_extraction
from
vllm.entrypoints.openai.chat_completion.protocol
import
(
ChatCompletionRequest
,
)
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
# Using mistral tokenizer as a generic mock since the actual model is not on HF
# Using mistral tokenizer as a generic mock since the actual model is not on HF
...
@@ -100,6 +103,39 @@ NEW_LINE_STREAMING = {
...
@@ -100,6 +103,39 @@ NEW_LINE_STREAMING = {
"is_reasoning_end"
:
True
,
"is_reasoning_end"
:
True
,
}
}
# Output whose reasoning span opens with a "thought\n" label; the expected
# reasoning text does not include that label.
THOUGHT_PREFIX = {
    "output": "<|channel>thought\nActual reasoning here<channel|>Final answer",
    "reasoning": "Actual reasoning here",
    "content": "Final answer",
    "is_reasoning_end": True,
}
# Only the "thought\n" label sits between the channel markers: expected
# reasoning is the empty string and there is no content.
THOUGHT_PREFIX_ONLY = {
    "output": "<|channel>thought\n<channel|>",
    "reasoning": "",
    "content": None,
    "is_reasoning_end": True,
}
# Same label, followed by reasoning that itself contains a newline; the inner
# newline is expected to be kept.
THOUGHT_PREFIX_MULTILINE = {
    "output": "<|channel>thought\nLine1\nLine2<channel|>Answer",
    "reasoning": "Line1\nLine2",
    "content": "Answer",
    "is_reasoning_end": True,
}
# "thousand" starts like "thought" but diverges — exercises Case 2→3 in streaming.
THOUGHT_PREFIX_DIVERGE = {
    "output": "<|channel>thousand reasons<channel|>Done",
    "reasoning": "thousand reasons",
    "content": "Done",
    "is_reasoning_end": True,
}
# The model isn't reasoning if we're generating tool calls.
TOOL_CALL_STARTED = {
    "output": "<|tool_call>",
    "reasoning": None,
    "content": "<|tool_call>",
    "is_reasoning_end": True,
}
TEST_CASES
=
[
TEST_CASES
=
[
pytest
.
param
(
False
,
INVALID_SIMPLE_NONSTREAMING
,
id
=
"invalid_simple"
),
pytest
.
param
(
False
,
INVALID_SIMPLE_NONSTREAMING
,
id
=
"invalid_simple"
),
pytest
.
param
(
True
,
INVALID_SIMPLE_STREAMING
,
id
=
"invalid_simple_streaming"
),
pytest
.
param
(
True
,
INVALID_SIMPLE_STREAMING
,
id
=
"invalid_simple_streaming"
),
...
@@ -120,17 +156,22 @@ TEST_CASES = [
...
@@ -120,17 +156,22 @@ TEST_CASES = [
pytest
.
param
(
False
,
EMPTY
,
id
=
"empty"
),
pytest
.
param
(
False
,
EMPTY
,
id
=
"empty"
),
pytest
.
param
(
False
,
NEW_LINE_NONSTREAMING
,
id
=
"new_line"
),
pytest
.
param
(
False
,
NEW_LINE_NONSTREAMING
,
id
=
"new_line"
),
pytest
.
param
(
True
,
NEW_LINE_STREAMING
,
id
=
"new_line_streaming"
),
pytest
.
param
(
True
,
NEW_LINE_STREAMING
,
id
=
"new_line_streaming"
),
pytest
.
param
(
False
,
THOUGHT_PREFIX
,
id
=
"thought_prefix"
),
pytest
.
param
(
True
,
THOUGHT_PREFIX
,
id
=
"thought_prefix_streaming"
),
pytest
.
param
(
False
,
THOUGHT_PREFIX_ONLY
,
id
=
"thought_prefix_only"
),
pytest
.
param
(
True
,
THOUGHT_PREFIX_ONLY
,
id
=
"thought_prefix_only_streaming"
),
pytest
.
param
(
False
,
THOUGHT_PREFIX_MULTILINE
,
id
=
"thought_prefix_multiline"
),
pytest
.
param
(
True
,
THOUGHT_PREFIX_MULTILINE
,
id
=
"thought_prefix_multiline_streaming"
),
pytest
.
param
(
False
,
THOUGHT_PREFIX_DIVERGE
,
id
=
"thought_prefix_diverge"
),
pytest
.
param
(
True
,
THOUGHT_PREFIX_DIVERGE
,
id
=
"thought_prefix_diverge_streaming"
),
pytest
.
param
(
False
,
TOOL_CALL_STARTED
,
id
=
"tool_call_started"
),
pytest
.
param
(
True
,
TOOL_CALL_STARTED
,
id
=
"tool_call_started_streaming"
),
]
]
@
pytest
.
mark
.
parametrize
(
"streaming, param_dict"
,
TEST_CASES
)
def
gemma4_encode_output
(
generic_tokenizer
,
output
:
str
)
->
list
[
int
]:
def
test_gemma4_reasoning
(
streaming
:
bool
,
param_dict
:
dict
,
generic_tokenizer
,
):
output
=
param_dict
[
"output"
]
# Resolve token IDs dynamically from the real tokenizer
# Resolve token IDs dynamically from the real tokenizer
vocab
=
generic_tokenizer
.
get_vocab
()
vocab
=
generic_tokenizer
.
get_vocab
()
start_token_id
=
vocab
[
"<|channel>"
]
start_token_id
=
vocab
[
"<|channel>"
]
...
@@ -176,6 +217,18 @@ def test_gemma4_reasoning(
...
@@ -176,6 +217,18 @@ def test_gemma4_reasoning(
else
:
else
:
output_tokens
+=
_encode
(
output
)
output_tokens
+=
_encode
(
output
)
return
output_tokens
@
pytest
.
mark
.
parametrize
(
"streaming, param_dict"
,
TEST_CASES
)
def
test_gemma4_reasoning
(
streaming
:
bool
,
param_dict
:
dict
,
generic_tokenizer
,
):
output
=
param_dict
[
"output"
]
output_tokens
=
gemma4_encode_output
(
generic_tokenizer
,
output
)
parser
:
ReasoningParser
=
ReasoningParserManager
.
get_reasoning_parser
(
parser_name
)(
parser
:
ReasoningParser
=
ReasoningParserManager
.
get_reasoning_parser
(
parser_name
)(
generic_tokenizer
generic_tokenizer
)
)
...
@@ -194,3 +247,29 @@ def test_gemma4_reasoning(
...
@@ -194,3 +247,29 @@ def test_gemma4_reasoning(
# Test is_reasoning_end
# Test is_reasoning_end
is_reasoning_end
=
parser
.
is_reasoning_end
(
output_tokens
)
is_reasoning_end
=
parser
.
is_reasoning_end
(
output_tokens
)
assert
is_reasoning_end
==
param_dict
[
"is_reasoning_end"
]
assert
is_reasoning_end
==
param_dict
[
"is_reasoning_end"
]
def test_gemma4_adjust_request(generic_tokenizer):
    """adjust_request must turn off special-token skipping on the request.

    The parser is expected to mutate and return the very same request
    object rather than a copy.
    """
    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(generic_tokenizer)

    request = ChatCompletionRequest(messages=[], model="test-model")
    # Precondition: a fresh request skips special tokens.
    assert request.skip_special_tokens is True

    adjusted = parser.adjust_request(request)

    assert adjusted.skip_special_tokens is False
    assert adjusted is request
def test_gemma4_previous_turn_reasoning_is_reasoning_end(generic_tokenizer):
    """A closed thought channel from an earlier turn must not count.

    The encoded text ends with a freshly opened model turn, so
    is_reasoning_end is expected to be falsy even though a previous turn
    contains a complete thought block.
    """
    previous_turn = "<|channel>thought\n1st thought<channel|>1st content<turn|>\n"
    next_turn_opener = "<|turn>user\nThanks<|turn>model\n"
    token_ids = gemma4_encode_output(
        generic_tokenizer, previous_turn + next_turn_opener
    )

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(generic_tokenizer)

    assert not parser.is_reasoning_end(token_ids)
tests/reasoning/test_step3p5_reasoning_parser.py
View file @
fc67613a
...
@@ -2,10 +2,10 @@
...
@@ -2,10 +2,10 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
from
transformers
import
AutoTokenizer
from
tests.reasoning.utils
import
run_reasoning_extraction
from
tests.reasoning.utils
import
run_reasoning_extraction
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
from
vllm.tokenizers
import
get_tokenizer
parser_name
=
"step3p5"
parser_name
=
"step3p5"
start_token
=
"<think>"
start_token
=
"<think>"
...
@@ -16,7 +16,7 @@ REASONING_MODEL_NAME = "stepfun-ai/Step-3.5-Flash"
...
@@ -16,7 +16,7 @@ REASONING_MODEL_NAME = "stepfun-ai/Step-3.5-Flash"
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
step3p5_tokenizer
():
def
step3p5_tokenizer
():
return
AutoT
okenizer
.
from_pretrained
(
REASONING_MODEL_NAME
)
return
get_t
okenizer
(
tokenizer_name
=
REASONING_MODEL_NAME
)
SIMPLE_REASONING
=
{
SIMPLE_REASONING
=
{
...
...
tests/renderers/test_gemma4_chat_template.py
0 → 100644
View file @
fc67613a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Gemma4 chat template rendering."""
from
pathlib
import
Path
import
jinja2.sandbox
import
pytest
# Absolute path of the Gemma4 tool chat template: three directories above
# this file, then "examples/tool_chat_template_gemma4.jinja".
TEMPLATE_PATH = (
    Path(__file__)
    .resolve()
    .parent.parent.parent.joinpath("examples", "tool_chat_template_gemma4.jinja")
)
@pytest.fixture(scope="module")
def gemma4_template():
    """Load and compile the Gemma4 chat template once per test module.

    Returns:
        The template at TEMPLATE_PATH compiled by an
        ImmutableSandboxedEnvironment.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale's default encoding of the machine running the tests.
    template_str = TEMPLATE_PATH.read_text(encoding="utf-8")
    env = jinja2.sandbox.ImmutableSandboxedEnvironment()
    return env.from_string(template_str)
def
_render
(
template
,
messages
,
**
kwargs
):
"""Render the template with sensible defaults."""
kwargs
.
setdefault
(
"bos_token"
,
"<bos>"
)
kwargs
.
setdefault
(
"add_generation_prompt"
,
False
)
return
template
.
render
(
messages
=
messages
,
**
kwargs
)
class TestGemma4ChatTemplate:
    """Rendering checks for the Gemma4 tool chat template.

    Every test renders a small conversation through ``_render`` and asserts
    on substrings of the resulting prompt.
    """

    def test_basic_multiturn_thinking_disabled(self, gemma4_template):
        """With enable_thinking=False (default), generation prompt ends with
        an empty thought channel to suppress thinking."""
        messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
            {"role": "user", "content": "How are you?"},
        ]
        result = _render(gemma4_template, messages, add_generation_prompt=True)
        assert "<|turn>user\n" in result
        assert "<|turn>model\n" in result
        assert "Hello" in result
        assert "Hi there!" in result
        assert "How are you?" in result
        assert result.rstrip("\n").endswith("<|channel>thought\n<channel|>")

    def test_basic_multiturn_thinking_enabled(self, gemma4_template):
        """With enable_thinking=True, generation prompt ends with model
        turn opener (no thought suppression)."""
        messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
            {"role": "user", "content": "How are you?"},
        ]
        result = _render(
            gemma4_template,
            messages,
            add_generation_prompt=True,
            enable_thinking=True,
        )
        assert "<|turn>user\n" in result
        assert "<|turn>model\n" in result
        assert "Hello" in result
        assert "Hi there!" in result
        assert "How are you?" in result
        assert result.rstrip("\n").endswith("<|turn>model")

    def test_system_message(self, gemma4_template):
        """A system message is rendered inside its own system turn."""
        messages = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Hi"},
        ]
        result = _render(gemma4_template, messages)
        assert "<|turn>system\n" in result
        assert "You are helpful." in result

    def test_thinking_enabled(self, gemma4_template):
        """enable_thinking=True emits <|think|> and a system turn even when
        the conversation has no system message."""
        messages = [{"role": "user", "content": "Think about this"}]
        result = _render(
            gemma4_template,
            messages,
            add_generation_prompt=True,
            enable_thinking=True,
        )
        assert "<|think|>" in result
        assert "<|turn>system\n" in result

    def test_tool_declarations(self, gemma4_template):
        """Tools are declared in a <|tool>...<tool|> block with string
        values wrapped in the <|"|> delimiter."""
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get weather for a city",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "city": {
                                "type": "string",
                                "description": "City name",
                            }
                        },
                        "required": ["city"],
                    },
                },
            }
        ]
        messages = [{"role": "user", "content": "What is the weather?"}]
        result = _render(
            gemma4_template,
            messages,
            tools=tools,
            add_generation_prompt=True,
        )
        assert "<|tool>" in result
        assert "declaration:get_weather" in result
        assert "<tool|>" in result
        assert '<|"|>City name<|"|>' in result

    def test_tool_calls_in_assistant(self, gemma4_template):
        """Assistant tool_calls render as <|tool_call>call:NAME{...}<tool_call|>."""
        messages = [
            {"role": "user", "content": "Weather in London?"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "call_1",
                        "function": {
                            "name": "get_weather",
                            "arguments": {"city": "London"},
                        },
                    }
                ],
            },
        ]
        result = _render(gemma4_template, messages)
        assert "<|tool_call>call:get_weather{" in result
        assert "}<tool_call|>" in result
        assert '<|"|>London<|"|>' in result

    def test_tool_responses_openai_style(self, gemma4_template):
        """role='tool' messages are formatted as <|tool_response> blocks
        with content dumped as-is."""
        messages = [
            {"role": "user", "content": "Weather?"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "call_1",
                        "function": {
                            "name": "get_weather",
                            "arguments": {"city": "London"},
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "call_1",
                "content": '{"temperature": 15, "condition": "sunny"}',
            },
        ]
        result = _render(gemma4_template, messages, add_generation_prompt=True)
        assert "<|tool_response>" in result
        assert "response:get_weather{" in result
        assert "<tool_response|>" in result
        assert '"temperature": 15' in result

    def test_tool_responses_legacy_style(self, gemma4_template):
        """tool_responses embedded on the assistant message."""
        messages = [
            {"role": "user", "content": "Weather?"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "function": {
                            "name": "get_weather",
                            "arguments": {"city": "London"},
                        },
                    }
                ],
                "tool_responses": [
                    {
                        "name": "get_weather",
                        "response": {"temperature": 20},
                    }
                ],
            },
        ]
        result = _render(gemma4_template, messages)
        assert "<|tool_response>" in result
        assert "response:get_weather{" in result
        assert "temperature:" in result

    def test_generation_prompt_not_after_tool_response(self, gemma4_template):
        """add_generation_prompt=True should NOT add <|turn>model when the
        last message type was tool_response (the model turn continues)."""
        messages = [
            {"role": "user", "content": "Weather?"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "call_1",
                        "function": {
                            "name": "get_weather",
                            "arguments": {"city": "London"},
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "call_1",
                "content": "sunny",
            },
        ]
        result = _render(gemma4_template, messages, add_generation_prompt=True)
        # Compare against the opener WITHOUT its trailing newline: a string
        # that has been stripped can never end with "\n", so checking for
        # "<|turn>model\n" on the stripped text would always pass.
        assert not result.rstrip().endswith("<|turn>model")
        # Only the assistant message may open a model turn; the generation
        # prompt must not open a second one after the tool response.
        assert result.count("<|turn>model\n") == 1

    def test_reasoning_in_tool_chains(self, gemma4_template):
        """reasoning field on assistant with tool_calls after last user
        message emits <|channel>thought\\n...<channel|>."""
        messages = [
            {"role": "user", "content": "Calculate something"},
            {
                "role": "assistant",
                "content": "",
                "reasoning": "Let me think about this...",
                "tool_calls": [
                    {
                        "function": {
                            "name": "calculator",
                            "arguments": {"expr": "2+2"},
                        },
                    }
                ],
            },
        ]
        result = _render(gemma4_template, messages)
        assert "<|channel>thought\n" in result
        assert "Let me think about this..." in result
        assert "<channel|>" in result

    def test_reasoning_not_before_last_user(self, gemma4_template):
        """reasoning on assistant BEFORE the last user message is dropped."""
        messages = [
            {"role": "user", "content": "First"},
            {
                "role": "assistant",
                "content": "Response",
                "reasoning": "Old reasoning that should be dropped",
                "tool_calls": [
                    {
                        "function": {
                            "name": "fn",
                            "arguments": {},
                        },
                    }
                ],
            },
            {"role": "user", "content": "Second"},
        ]
        result = _render(gemma4_template, messages, add_generation_prompt=True)
        assert "Old reasoning" not in result

    def test_strip_thinking_in_model_content(self, gemma4_template):
        """<|channel>...<channel|> in model content is stripped by the
        strip_thinking macro."""
        messages = [
            {"role": "user", "content": "Hi"},
            {
                "role": "assistant",
                "content": ("<|channel>internal thought<channel|>Visible answer"),
            },
        ]
        result = _render(gemma4_template, messages)
        assert "internal thought" not in result
        assert "Visible answer" in result

    def test_multi_turn_tool_chain(self, gemma4_template):
        """assistant->tool->assistant->tool produces exactly one
        <|turn>model (later assistants continue the same turn)."""
        messages = [
            {"role": "user", "content": "Do two things"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "c1",
                        "function": {"name": "step1", "arguments": {}},
                    },
                ],
            },
            {"role": "tool", "tool_call_id": "c1", "content": "result1"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "c2",
                        "function": {"name": "step2", "arguments": {}},
                    },
                ],
            },
            {"role": "tool", "tool_call_id": "c2", "content": "result2"},
        ]
        result = _render(gemma4_template, messages, add_generation_prompt=True)
        assert result.count("<|turn>model\n") == 1

    def test_format_argument_types(self, gemma4_template):
        """Strings wrapped in <|"|>, booleans as true/false, numbers bare."""
        messages = [
            {"role": "user", "content": "Test"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "function": {
                            "name": "test_fn",
                            "arguments": {
                                "name": "Alice",
                                "active": True,
                                "count": 42,
                            },
                        },
                    }
                ],
            },
        ]
        result = _render(gemma4_template, messages)
        assert '<|"|>Alice<|"|>' in result
        assert "active:true" in result
        assert "count:42" in result
tests/tool_parsers/test_gemma4_tool_parser.py
View file @
fc67613a
...
@@ -85,6 +85,14 @@ class TestParseGemma4Args:
...
@@ -85,6 +85,14 @@ class TestParseGemma4Args:
result
=
_parse_gemma4_args
(
"flag:false"
)
result
=
_parse_gemma4_args
(
"flag:false"
)
assert
result
==
{
"flag"
:
False
}
assert
result
==
{
"flag"
:
False
}
def test_null_value(self):
    """A bare ``null`` parses to Python None, not to the string "null"."""
    # If the parser returned the string "null", tool_choice=auto would emit
    # `{"param": "null"}` instead of `{"param": null}` for nullable tool
    # parameters.
    parsed = _parse_gemma4_args("param:null")
    assert parsed == {"param": None}
    serialized = json.dumps(parsed)
    assert serialized == '{"param": null}'
def
test_mixed_types
(
self
):
def
test_mixed_types
(
self
):
result
=
_parse_gemma4_args
(
result
=
_parse_gemma4_args
(
'name:<|"|>test<|"|>,count:42,active:true,score:3.14'
'name:<|"|>test<|"|>,count:42,active:true,score:3.14'
...
@@ -114,6 +122,19 @@ class TestParseGemma4Args:
...
@@ -114,6 +122,19 @@ class TestParseGemma4Args:
result
=
_parse_gemma4_args
(
"key:"
)
result
=
_parse_gemma4_args
(
"key:"
)
assert
result
==
{
"key"
:
""
}
assert
result
==
{
"key"
:
""
}
def test_empty_value_partial_withheld(self):
    """In partial mode a key whose value has not arrived yet is withheld."""
    without_space = _parse_gemma4_args("key:", partial=True)
    assert without_space == {}

    # Same expectation when a space follows the colon.
    with_space = _parse_gemma4_args("key: ", partial=True)
    assert with_space == {}
def test_empty_value_after_other_keys_partial_withheld(self):
    """Only the trailing valueless key is withheld; completed keys stay."""
    partial_args = 'name:<|"|>test<|"|>,flag:'
    parsed = _parse_gemma4_args(partial_args, partial=True)
    assert parsed == {"name": "test"}
class
TestParseGemma4Array
:
class
TestParseGemma4Array
:
def
test_string_array
(
self
):
def
test_string_array
(
self
):
...
@@ -491,6 +512,51 @@ class TestStreamingExtraction:
...
@@ -491,6 +512,51 @@ class TestStreamingExtraction:
assert
parsed_args
[
"count"
]
==
42
assert
parsed_args
[
"count"
]
==
42
assert
parsed_args
[
"active"
]
is
True
assert
parsed_args
[
"active"
]
is
True
def test_streaming_boolean_split_across_chunks(self, parser, mock_request):
    """A ``true`` literal cut by a chunk boundary still yields valid JSON."""
    true_prefix = "true"[:3]
    chunks = [
        "<|tool_call>",
        "call:search{input:{all:" + true_prefix,
        "e}}",
        "<tool_call|>",
    ]

    streamed = self._simulate_streaming(parser, mock_request, chunks)
    args_text = self._collect_arguments(streamed)

    assert args_text, "No arguments were streamed"
    decoded = json.loads(args_text)
    assert decoded["input"]["all"] is True
def test_streaming_false_split_across_chunks(self, parser, mock_request):
    """A ``false`` literal cut by a chunk boundary still parses as False."""
    false_prefix = "false"[:4]
    chunks = [
        "<|tool_call>",
        "call:set{flag:" + false_prefix,
        "e}",
        "<tool_call|>",
    ]

    streamed = self._simulate_streaming(parser, mock_request, chunks)
    args_text = self._collect_arguments(streamed)

    assert args_text, "No arguments were streamed"
    decoded = json.loads(args_text)
    assert decoded["flag"] is False
def test_streaming_number_split_across_chunks(self, parser, mock_request):
    """Digits of one number arriving in two chunks stay a single number."""
    chunks = ["<|tool_call>", "call:set{count:4", "2}", "<tool_call|>"]

    streamed = self._simulate_streaming(parser, mock_request, chunks)
    args_text = self._collect_arguments(streamed)

    assert args_text, "No arguments were streamed"
    decoded = json.loads(args_text)
    assert decoded["count"] == 42
def
test_streaming_empty_args
(
self
,
parser
,
mock_request
):
def
test_streaming_empty_args
(
self
,
parser
,
mock_request
):
"""Tool call with no arguments."""
"""Tool call with no arguments."""
chunks
=
[
chunks
=
[
...
@@ -502,3 +568,119 @@ class TestStreamingExtraction:
...
@@ -502,3 +568,119 @@ class TestStreamingExtraction:
results
=
self
.
_simulate_streaming
(
parser
,
mock_request
,
chunks
)
results
=
self
.
_simulate_streaming
(
parser
,
mock_request
,
chunks
)
name
=
self
.
_collect_function_name
(
results
)
name
=
self
.
_collect_function_name
(
results
)
assert
name
==
"get_status"
assert
name
==
"get_status"
def test_streaming_split_delimiter_no_invalid_json(self, parser, mock_request):
    """Fragments of a split <|"|> delimiter must not reach the JSON output.

    Regression test for
    https://github.com/vllm-project/vllm/issues/38946: when a token
    boundary falls inside the string delimiter, a fragment such as '<|' is
    left at the end of the parsed value and corrupts the streamed JSON.
    """
    # The closing delimiter is deliberately split between two chunks.
    chunks = [
        "<|tool_call>",
        "call:todowrite{",
        'content:<|"|>Buy milk<|',
        '"|>}',
        "<tool_call|>",
    ]

    streamed = self._simulate_streaming(parser, mock_request, chunks)
    args_text = self._collect_arguments(streamed)
    assert args_text, "No arguments were streamed"

    # json.loads raising here is exactly how the original bug showed up.
    decoded = json.loads(args_text)
    assert decoded["content"] == "Buy milk"

    # No raw delimiter fragment may survive in the serialized arguments.
    assert "<|" not in args_text, (
        f"Partial delimiter leaked into JSON: {args_text!r}"
    )
def test_streaming_does_not_duplicate_plain_text_after_tool_call(
    self, parser, mock_request, monkeypatch
):
    """Plain text buffered after a tool call must not corrupt current_text.

    ``parser._extract_streaming`` is wrapped so that every ``current_text``
    it receives is recorded and can be inspected afterwards.
    """
    seen_current_texts: list[str] = []
    real_extract = parser._extract_streaming

    def recording_extract(previous_text, current_text, delta_text):
        seen_current_texts.append(current_text)
        return real_extract(previous_text, current_text, delta_text)

    monkeypatch.setattr(parser, "_extract_streaming", recording_extract)

    # The "<" of the trailing "<div>" shares a chunk with the end tag.
    chunks = [
        "<|tool_call>",
        "call:get_weather{",
        'location:<|"|>Paris<|"|>}',
        "<tool_call|><",
        "div>",
    ]
    streamed = self._simulate_streaming(parser, mock_request, chunks)

    emitted_content = ""
    for delta, _ in streamed:
        if delta is not None and delta.content:
            emitted_content += delta.content

    assert emitted_content == "<div>"
    last_current_text = seen_current_texts[-1]
    assert last_current_text.endswith("<tool_call|><div>")
    assert not last_current_text.endswith("<tool_call|><<div>")
def test_streaming_html_argument_does_not_duplicate_tag_prefixes(
    self, parser, mock_request
):
    """HTML content inside tool arguments must not be duplicated.

    The HTML payload is streamed in pieces that each end right after a
    "<", so every tag prefix lands on a chunk boundary.
    """
    # Single source of truth for the payload: the streamed chunks and the
    # expected argument value are both built from these pieces, so the two
    # cannot drift apart (including the indentation before each <meta>).
    html_pieces = [
        "<!DOCTYPE html>\n<",
        'html lang="zh-CN">\n<',
        "head>\n <",
        'meta charset="UTF-8">\n <',
        'meta name="viewport" content="width=device-width">\n',
    ]
    expected_content = "".join(html_pieces)

    chunks = [
        "<|tool_call>",
        "call:write_file{",
        'path:<|"|>index.html<|"|>,',
        'content:<|"|>' + html_pieces[0],
        *html_pieces[1:],
        '<|"|>}',
        "<tool_call|>",
    ]

    results = self._simulate_streaming(parser, mock_request, chunks)
    args_text = self._collect_arguments(results)

    assert args_text
    parsed_args = json.loads(args_text)
    assert parsed_args["path"] == "index.html"
    assert parsed_args["content"] == expected_content
def test_streaming_trailing_bare_bool_not_duplicated(self, parser, mock_request):
    """A bare boolean in last position is emitted exactly once."""
    # The final key and its value arrive in separate chunks.
    chunks = [
        "<|tool_call>",
        "call:Edit{",
        'file_path:<|"|>src/env.py<|"|>,',
        'old_string:<|"|>old_val<|"|>,',
        'new_string:<|"|>new_val<|"|>,',
        "replace_all:",
        "false}",
        "<tool_call|>",
    ]
    expected_args = {
        "file_path": "src/env.py",
        "old_string": "old_val",
        "new_string": "new_val",
        "replace_all": False,
    }

    streamed = self._simulate_streaming(parser, mock_request, chunks)
    args_text = self._collect_arguments(streamed)

    assert args_text, "No arguments were streamed"
    assert json.loads(args_text) == expected_args
    assert args_text.count("replace_all") == 1
tests/v1/e2e/spec_decode/test_spec_decode.py
View file @
fc67613a
...
@@ -542,12 +542,16 @@ def test_eagle_correctness_light(
...
@@ -542,12 +542,16 @@ def test_eagle_correctness_light(
"auto"
,
"auto"
,
0.8
,
0.8
,
),
),
(
pytest
.
param
(
(
"eagle3"
,
"Qwen/Qwen3-8B"
,
"AngelSlim/Qwen3-8B_eagle3"
,
1
),
(
"eagle3"
,
"Qwen/Qwen3-8B"
,
"AngelSlim/Qwen3-8B_eagle3"
,
1
),
False
,
False
,
False
,
False
,
"transformers"
,
"transformers"
,
0.8
,
0.8
,
# TODO(hmellor): figure out why memory usage is so high
marks
=
pytest
.
mark
.
skip
(
reason
=
"Feature is experimental and uses too much memory in CI"
,
),
),
),
pytest
.
param
(
pytest
.
param
(
(
(
...
...
vllm/_custom_ops.py
View file @
fc67613a
...
@@ -3397,3 +3397,38 @@ if hasattr(torch.ops._C, "hadacore_transform"):
...
@@ -3397,3 +3397,38 @@ if hasattr(torch.ops._C, "hadacore_transform"):
@
register_fake
(
"_C::hadacore_transform"
)
@
register_fake
(
"_C::hadacore_transform"
)
def
_hadacore_transform_fake
(
x
:
torch
.
Tensor
,
inplace
:
bool
)
->
torch
.
Tensor
:
def
_hadacore_transform_fake
(
x
:
torch
.
Tensor
,
inplace
:
bool
)
->
torch
.
Tensor
:
return
torch
.
empty_like
(
x
)
if
not
inplace
else
x
return
torch
.
empty_like
(
x
)
if
not
inplace
else
x
# Register the fake implementation only when the compiled extension
# actually provides the op.
if hasattr(torch.ops._C, "minimax_allreduce_rms"):

    @register_fake("_C::minimax_allreduce_rms")
    def _minimax_allreduce_rms_fake(
        input: torch.Tensor,
        norm_weight: torch.Tensor,
        workspace: torch.Tensor,
        rank: int,
        nranks: int,
        eps: float,
    ) -> torch.Tensor:
        """Fake impl of ``_C::minimax_allreduce_rms``.

        Returns an uninitialized tensor with the same metadata as
        ``input``; no other argument influences the result.
        """
        fake_out = torch.empty_like(input)
        return fake_out
# Register the fake implementation only when the compiled extension
# actually provides the op.
if hasattr(torch.ops._C, "minimax_allreduce_rms_qk"):

    @register_fake("_C::minimax_allreduce_rms_qk")
    def _minimax_allreduce_rms_qk_fake(
        qkv: torch.Tensor,
        norm_weight_q: torch.Tensor,
        norm_weight_k: torch.Tensor,
        workspace: torch.Tensor,
        q_size: int,
        kv_size: int,
        rank: int,
        nranks: int,
        eps: float,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Fake impl of ``_C::minimax_allreduce_rms_qk``.

        Returns two uninitialized tensors that share ``qkv``'s dtype and
        device: one of shape ``[qkv.shape[0], q_size]`` and one of shape
        ``[qkv.shape[0], kv_size]``.
        """
        num_rows = qkv.shape[0]
        q_fake = torch.empty([num_rows, q_size], dtype=qkv.dtype, device=qkv.device)
        k_fake = torch.empty([num_rows, kv_size], dtype=qkv.dtype, device=qkv.device)
        return (q_fake, k_fake)
vllm/compilation/decorators.py
View file @
fc67613a
...
@@ -205,6 +205,8 @@ def support_torch_compile(
...
@@ -205,6 +205,8 @@ def support_torch_compile(
if
v
.
annotation
in
[
if
v
.
annotation
in
[
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
|
None
,
torch
.
Tensor
|
None
,
torch
.
FloatTensor
,
torch
.
FloatTensor
|
None
,
IntermediateTensors
,
IntermediateTensors
,
IntermediateTensors
|
None
,
IntermediateTensors
|
None
,
]:
]:
...
@@ -346,7 +348,7 @@ def _support_torch_compile(
...
@@ -346,7 +348,7 @@ def _support_torch_compile(
def
__init__
(
def
__init__
(
self
:
_T
,
self
:
_T
,
*
,
*
args
,
vllm_config
:
VllmConfig
|
None
=
None
,
vllm_config
:
VllmConfig
|
None
=
None
,
prefix
:
str
=
""
,
prefix
:
str
=
""
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
...
@@ -357,11 +359,24 @@ def _support_torch_compile(
...
@@ -357,11 +359,24 @@ def _support_torch_compile(
# NOTE: to support multimodal models (such as encoder),
# NOTE: to support multimodal models (such as encoder),
# we may not have vllm_config so we may need to patch it
# we may not have vllm_config so we may need to patch it
sig
=
inspect
.
signature
(
old_init
)
sig
=
inspect
.
signature
(
old_init
)
# Check that any positional arguments match the old_init method signature
annotations
=
[
p
.
annotation
for
p
in
sig
.
parameters
.
values
()]
for
arg
,
annotation
in
zip
(
args
,
annotations
):
if
annotation
is
inspect
.
_empty
:
continue
if
not
isinstance
(
arg
,
annotation
):
init
=
f
"'
{
type
(
self
).
__name__
}
.__init__'"
arg_type
=
f
"'
{
type
(
arg
).
__name__
}
'"
raise
TypeError
(
f
"
{
init
}
received a positional argument of type
{
arg_type
}
, "
"but no parameter of that type was found in the method signature. "
f
"Please either annotate
{
init
}
or pass it as a keyword argument."
)
if
"vllm_config"
in
sig
.
parameters
:
if
"vllm_config"
in
sig
.
parameters
:
kwargs
[
"vllm_config"
]
=
vllm_config
kwargs
[
"vllm_config"
]
=
vllm_config
if
"prefix"
in
sig
.
parameters
:
if
"prefix"
in
sig
.
parameters
:
kwargs
[
"prefix"
]
=
prefix
kwargs
[
"prefix"
]
=
prefix
old_init
(
self
,
**
kwargs
)
old_init
(
self
,
*
args
,
**
kwargs
)
self
.
vllm_config
=
vllm_config
self
.
vllm_config
=
vllm_config
self
.
compilation_config
=
self
.
vllm_config
.
compilation_config
self
.
compilation_config
=
self
.
vllm_config
.
compilation_config
...
...
vllm/compilation/passes/fusion/minimax_qk_norm_fusion.py
0 → 100644
View file @
fc67613a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Fusion pass: replace MiniMax QK allreduce + RMS norm with the Lamport
fused kernel (minimax_allreduce_rms_qk) for decode-size batches.
Pattern (inlined forward_qk in compiled graph):
q, k, v = qkv.split([q_size, kv_size, kv_size], -1)
q_fp32 = q.to(float32); k_fp32 = k.to(float32)
q_var = q_fp32.pow(2).mean(-1, keepdim=True)
k_var = k_fp32.pow(2).mean(-1, keepdim=True)
qk_var = cat([q_var, k_var], -1)
qk_var = allreduce(qk_var) / tp_world
q_var, k_var = qk_var.chunk(2, -1)
q_out = (q_fp32 * rsqrt(q_var + eps) * q_weight).to(orig_dtype)
k_out = (k_fp32 * rsqrt(k_var + eps) * k_weight).to(orig_dtype)
return q_out, k_out, v
Replacement (pure, no in-place on qkv/q/k):
q_out, k_out = minimax_qk_norm_fused(qkv, q_weight, k_weight, workspace, ...)
v = qkv.split([q_size, kv_size, kv_size], -1)[2]
return q_out, k_out, v
is_applicable_for_range: only fires for compile_range.end <= max_decode_tokens
so that large prefill batches fall through to the original forward_qk (= main).
"""
import
torch
import
torch._inductor.pattern_matcher
as
pm
import
torch.fx
as
fx
from
torch._inductor.pattern_matcher
import
PatternMatcherPass
from
vllm.config
import
VllmConfig
from
vllm.config.utils
import
Range
from
vllm.distributed
import
tensor_model_parallel_all_reduce
from
vllm.distributed.parallel_state
import
(
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
)
from
vllm.logger
import
init_logger
from
vllm.utils.torch_utils
import
direct_register_custom_op
from
..inductor_pass
import
enable_fake_mode
from
..vllm_inductor_pass
import
VllmInductorPass
,
VllmPatternMatcherPass
logger = init_logger(__name__)

# Upper bound on the token count the pass is applied to (the pass further
# caps it by the scheduler's max_num_batched_tokens).
MAX_TOKEN_NUM = 2048

# Handle to the registered fused custom op; stays None when the underlying
# _C kernel is not available in this build.
_MINIMAX_QK_NORM_FUSED_OP = None

if hasattr(torch.ops._C, "minimax_allreduce_rms_qk"):

    def _minimax_qk_norm_fused(
        qkv: torch.Tensor,
        norm_weight_q: torch.Tensor,
        norm_weight_k: torch.Tensor,
        q_size: int,
        kv_size: int,
        rank: int,
        nranks: int,
        eps: float,
        max_tokens: int,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Look up the allreduce workspace and invoke the fused _C kernel.

        The workspace is obtained from ``get_allreduce_workspace`` using the
        TP group's CPU process group, then passed to
        ``torch.ops._C.minimax_allreduce_rms_qk`` together with the inputs.
        """
        # Imported at call time, as in the rest of this module's workspace
        # handling.
        from vllm.distributed.parallel_state import get_tp_group
        from vllm.model_executor.layers.mamba.lamport_workspace import (
            get_allreduce_workspace,
        )

        tp_cpu_group = get_tp_group().cpu_group
        lamport_workspace = get_allreduce_workspace(
            rank=rank,
            world_size=nranks,
            max_tokens=max_tokens,
            process_group=tp_cpu_group,
        )
        fused_kernel = torch.ops._C.minimax_allreduce_rms_qk
        return fused_kernel(
            qkv,
            norm_weight_q,
            norm_weight_k,
            lamport_workspace,
            q_size,
            kv_size,
            rank,
            nranks,
            eps,
        )

    def _minimax_qk_norm_fused_fake(
        qkv: torch.Tensor,
        norm_weight_q: torch.Tensor,
        norm_weight_k: torch.Tensor,
        q_size: int,
        kv_size: int,
        rank: int,
        nranks: int,
        eps: float,
        max_tokens: int,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Fake impl: uninitialized outputs of shape ``[rows, q_size]`` and
        ``[rows, kv_size]`` with ``qkv``'s dtype and device."""
        num_rows = qkv.shape[0]
        q_fake = torch.empty([num_rows, q_size], dtype=qkv.dtype, device=qkv.device)
        k_fake = torch.empty([num_rows, kv_size], dtype=qkv.dtype, device=qkv.device)
        return (q_fake, k_fake)

    # Registered with no mutated arguments.
    direct_register_custom_op(
        op_name="minimax_qk_norm_fused",
        op_func=_minimax_qk_norm_fused,
        fake_impl=_minimax_qk_norm_fused_fake,
        mutates_args=[],
    )
    _MINIMAX_QK_NORM_FUSED_OP = torch.ops.vllm.minimax_qk_norm_fused.default
class MiniMaxQKNormPattern:
    """
    Match the forward_qk allreduce+rms pattern and replace with Lamport kernel.

    Two pattern variants are registered against the same replacement: one
    where a single split feeds q, k and v, and one where q, k and v each
    come from their own split call.
    """

    def __init__(
        self,
        q_size: int,
        kv_size: int,
        eps: float,
        tp_world: int,
        tp_rank: int,
        max_tokens: int,
        dtype: torch.dtype,
        device: str | None,
    ) -> None:
        """Store the constants that get baked into the traced functions.

        Args:
            q_size: width of the q slice of the last dimension of qkv.
            kv_size: width of each of the k and v slices.
            eps: value added to the variance before rsqrt.
            tp_world: divisor applied to the all-reduced variance; also
                forwarded to the fused op.
            tp_rank: rank forwarded to the fused op.
            max_tokens: forwarded to the fused op.
            dtype: dtype of the example inputs and of q_out / k_out.
            device: device on which the example inputs are created.
        """
        self.q_size = q_size
        self.kv_size = kv_size
        self.eps = eps
        self.tp_world = tp_world
        self.tp_rank = tp_rank
        self.max_tokens = max_tokens
        self.dtype = dtype
        self.device = device

    def get_inputs(self) -> list[torch.Tensor]:
        """Return example [qkv, q_weight, k_weight] tensors for tracing.

        qkv has 4 rows and q_size + 2 * kv_size columns; the weights are
        1-D with q_size and kv_size elements respectively.
        """
        T = 4
        qkv = torch.empty(
            [T, self.q_size + 2 * self.kv_size],
            device=self.device,
            dtype=self.dtype,
        )
        q_weight = torch.empty([self.q_size], device=self.device, dtype=self.dtype)
        k_weight = torch.empty([self.kv_size], device=self.device, dtype=self.dtype)
        return [qkv, q_weight, k_weight]

    def register(self, pm_pass: PatternMatcherPass) -> None:
        """Register both pattern variants and their replacement on pm_pass."""
        # Copy the attributes into locals so the nested functions below
        # close over plain values rather than over `self`.
        q_size = self.q_size
        kv_size = self.kv_size
        eps = self.eps
        tp_world = self.tp_world
        max_tokens = self.max_tokens
        tp_rank = self.tp_rank
        dtype = self.dtype

        def pattern(
            qkv: torch.Tensor,
            q_weight: torch.Tensor,
            k_weight: torch.Tensor,
        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
            # Variant 1: one split produces q, k and v.
            q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
            q_fp32 = q.to(torch.float32)
            k_fp32 = k.to(torch.float32)
            # Mean of squares over the last dimension, computed in fp32.
            q_var = q_fp32.pow(2).mean(dim=-1, keepdim=True)
            k_var = k_fp32.pow(2).mean(dim=-1, keepdim=True)
            # Both variances go through a single all-reduce, then are
            # divided by tp_world.
            qk_var = torch.cat([q_var, k_var], dim=-1)
            qk_var = tensor_model_parallel_all_reduce(qk_var) / tp_world
            q_var, k_var = qk_var.chunk(2, dim=-1)
            q_out = (q_fp32 * torch.rsqrt(q_var + eps) * q_weight).to(dtype)
            k_out = (k_fp32 * torch.rsqrt(k_var + eps) * k_weight).to(dtype)
            return q_out, k_out, v

        def replacement(
            qkv: torch.Tensor,
            q_weight: torch.Tensor,
            k_weight: torch.Tensor,
        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
            assert _MINIMAX_QK_NORM_FUSED_OP is not None
            q_out, k_out = torch.ops.vllm.minimax_qk_norm_fused(
                qkv,
                q_weight,
                k_weight,
                q_size,
                kv_size,
                tp_rank,
                tp_world,
                eps,
                max_tokens,
            )
            # v is taken from a fresh split of the original qkv.
            _, _, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
            return q_out, k_out, v

        pm.register_replacement(
            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
        )

        # Second pattern: three separate split_with_sizes nodes (one per output),
        # each with _users=1. This occurs when the QKV projection uses a
        # functional GEMM kernel (e.g. cutlass_scaled_mm via auto_functionalized),
        # which causes inductor to generate one split per consumer.
        def pattern_split3(
            qkv: torch.Tensor,
            q_weight: torch.Tensor,
            k_weight: torch.Tensor,
        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
            q = qkv.split([q_size, kv_size, kv_size], dim=-1)[0]
            k = qkv.split([q_size, kv_size, kv_size], dim=-1)[1]
            v = qkv.split([q_size, kv_size, kv_size], dim=-1)[2]
            q_fp32 = q.to(torch.float32)
            k_fp32 = k.to(torch.float32)
            q_var = q_fp32.pow(2).mean(dim=-1, keepdim=True)
            k_var = k_fp32.pow(2).mean(dim=-1, keepdim=True)
            qk_var = torch.cat([q_var, k_var], dim=-1)
            qk_var = tensor_model_parallel_all_reduce(qk_var) / tp_world
            q_var, k_var = qk_var.chunk(2, dim=-1)
            q_out = (q_fp32 * torch.rsqrt(q_var + eps) * q_weight).to(dtype)
            k_out = (k_fp32 * torch.rsqrt(k_var + eps) * k_weight).to(dtype)
            return q_out, k_out, v

        pm.register_replacement(
            pattern_split3, replacement, self.get_inputs(), pm.fwd_only, pm_pass
        )
class MiniMaxQKNormPass(VllmPatternMatcherPass):
    """
    Replace forward_qk allreduce+norm with the Lamport fused kernel.
    Only applied for decode-size compile ranges (small token counts).

    The pass starts out disabled and enables itself only when every
    precondition checked in ``__init__`` holds.
    """

    def __init__(self, config: VllmConfig) -> None:
        """Validate the environment and, if supported, register patterns.

        The pass stays disabled when the fused op is missing, when the
        tensor-parallel world size is <= 1, when there is no model config,
        when the first architecture is not ``MiniMaxM2ForCausalLM``, or
        when the head/hidden sizes differ from the supported configuration.
        """
        super().__init__(config)
        self.disabled = True

        if _MINIMAX_QK_NORM_FUSED_OP is None:
            logger.warning_once(
                "minimax_allreduce_rms_qk op not found, MiniMaxQKNormPass disabled."
            )
            return

        tp_world = get_tensor_model_parallel_world_size()
        if tp_world <= 1:
            logger.warning_once("MiniMaxQKNormPass disabled: tp_size <= 1.")
            return

        if config.model_config is None:
            logger.warning_once("MiniMaxQKNormPass disabled: no model_config.")
            return

        hf_cfg = config.model_config.hf_config
        # `architectures` may be absent, None or empty; treat all of these
        # as "not the supported model" instead of raising while indexing.
        architectures = getattr(hf_cfg, "architectures", None) or ()
        model_name = architectures[0] if architectures else None
        if model_name != "MiniMaxM2ForCausalLM":
            return

        num_attention_heads = getattr(hf_cfg, "num_attention_heads", 0)
        num_key_value_heads = getattr(hf_cfg, "num_key_value_heads", 0)
        hidden_size = getattr(hf_cfg, "hidden_size", 0)
        head_dim = getattr(hf_cfg, "head_dim", 0)
        eps: float = getattr(hf_cfg, "rms_norm_eps", 1e-6)

        # Only this exact shape configuration is accepted.
        if (
            num_attention_heads != 48
            or num_key_value_heads != 8
            or hidden_size != 3072
            or head_dim != 128
        ):
            logger.warning_once(
                "MiniMaxQKNormPass disabled: cannot infer model info from hf_config."
            )
            return

        num_heads_per_rank = num_attention_heads // tp_world
        # At least one KV head per rank, even when tp_world exceeds the
        # number of KV heads.
        num_kv_heads_per_rank = max(1, num_key_value_heads // tp_world)
        q_size = num_heads_per_rank * head_dim
        kv_size = num_kv_heads_per_rank * head_dim
        self.max_token_num = min(
            MAX_TOKEN_NUM, config.scheduler_config.max_num_batched_tokens
        )
        tp_rank = get_tensor_model_parallel_rank()

        # Allocate Lamport workspace first.
        from vllm.distributed.parallel_state import get_tp_group
        from vllm.model_executor.layers.mamba.lamport_workspace import (
            get_allreduce_workspace,
        )

        get_allreduce_workspace(
            rank=tp_rank,
            world_size=tp_world,
            max_tokens=self.max_token_num,
            process_group=get_tp_group().cpu_group,
        )

        self.patterns: PatternMatcherPass = PatternMatcherPass(
            pass_name="minimax_qk_norm_pass"
        )
        self._register_patterns(q_size, kv_size, eps, tp_world, tp_rank)
        self.dump_patterns(config, self.patterns)
        self.disabled = False

    @enable_fake_mode
    def _register_patterns(
        self,
        q_size: int,
        kv_size: int,
        eps: float,
        tp_world: int,
        tp_rank: int,
    ) -> None:
        """Build a MiniMaxQKNormPattern and register it on self.patterns."""
        MiniMaxQKNormPattern(
            q_size=q_size,
            kv_size=kv_size,
            eps=eps,
            tp_world=tp_world,
            tp_rank=tp_rank,
            max_tokens=self.max_token_num,
            dtype=self.model_dtype,
            device=self.device,
        ).register(self.patterns)

    def is_applicable_for_range(self, compile_range: Range) -> bool:
        """Return True only when enabled and the range end fits within
        ``self.max_token_num``."""
        if self.disabled:
            return False
        return bool(compile_range.end <= self.max_token_num)

    @VllmInductorPass.time_and_log
    def __call__(self, graph: fx.Graph) -> None:
        """Apply the registered patterns to *graph* (no-op when disabled)."""
        if self.disabled:
            return
        self.matched_count = self.patterns.apply(graph)
        logger.debug("MiniMaxQKNormPass replaced %s patterns", self.matched_count)

    def uuid(self) -> str:
        """Return a hash over the source of this pass and its pattern class."""
        return VllmInductorPass.hash_source(self, MiniMaxQKNormPattern)
vllm/compilation/passes/pass_manager.py
View file @
fc67613a
...
@@ -36,6 +36,7 @@ if current_platform.is_cuda_alike():
...
@@ -36,6 +36,7 @@ if current_platform.is_cuda_alike():
if
current_platform
.
is_cuda
():
if
current_platform
.
is_cuda
():
from
.fusion.allreduce_rms_fusion
import
AllReduceFusionPass
from
.fusion.allreduce_rms_fusion
import
AllReduceFusionPass
from
.fusion.collective_fusion
import
AsyncTPPass
from
.fusion.collective_fusion
import
AsyncTPPass
from
.fusion.minimax_qk_norm_fusion
import
MiniMaxQKNormPass
from
.inductor_pass
import
(
from
.inductor_pass
import
(
CustomGraphPass
,
CustomGraphPass
,
...
@@ -124,6 +125,9 @@ class PostGradPassManager(CustomGraphPass): # type: ignore[misc]
...
@@ -124,6 +125,9 @@ class PostGradPassManager(CustomGraphPass): # type: ignore[misc]
if
self
.
pass_config
.
fuse_allreduce_rms
:
if
self
.
pass_config
.
fuse_allreduce_rms
:
self
.
passes
+=
[
AllReduceFusionPass
(
config
)]
self
.
passes
+=
[
AllReduceFusionPass
(
config
)]
if
self
.
pass_config
.
fuse_minimax_qk_norm
:
self
.
passes
+=
[
MiniMaxQKNormPass
(
config
)]
if
self
.
pass_config
.
fuse_norm_quant
:
if
self
.
pass_config
.
fuse_norm_quant
:
self
.
passes
+=
[
RMSNormQuantFusionPass
(
config
)]
self
.
passes
+=
[
RMSNormQuantFusionPass
(
config
)]
if
rocm_aiter_ops
.
is_enabled
():
if
rocm_aiter_ops
.
is_enabled
():
...
...
vllm/config/compilation.py
View file @
fc67613a
...
@@ -132,6 +132,8 @@ class PassConfig:
...
@@ -132,6 +132,8 @@ class PassConfig:
"""Enable async TP."""
"""Enable async TP."""
fuse_allreduce_rms
:
bool
=
None
# type: ignore[assignment]
fuse_allreduce_rms
:
bool
=
None
# type: ignore[assignment]
"""Enable flashinfer allreduce fusion."""
"""Enable flashinfer allreduce fusion."""
fuse_minimax_qk_norm
:
bool
=
None
# type: ignore[assignment]
"""Enable fused allreduce+RMSNorm for MiniMax QK norm."""
enable_qk_norm_rope_fusion
:
bool
=
False
enable_qk_norm_rope_fusion
:
bool
=
False
"""Enable fused Q/K RMSNorm + RoPE pass."""
"""Enable fused Q/K RMSNorm + RoPE pass."""
...
@@ -282,7 +284,7 @@ class PassConfig:
...
@@ -282,7 +284,7 @@ class PassConfig:
"""
"""
enabled_fusions
=
[
enabled_fusions
=
[
f
.
name
[
len
(
"fuse_"
)
:]
f
.
name
[
len
(
"fuse_"
)
:]
for
f
in
fields
(
self
)
for
f
in
fields
(
self
)
# type: ignore[arg-type]
if
getattr
(
self
,
f
.
name
)
and
f
.
name
.
startswith
(
"fuse_"
)
if
getattr
(
self
,
f
.
name
)
and
f
.
name
.
startswith
(
"fuse_"
)
]
]
...
@@ -486,9 +488,10 @@ class CompilationConfig:
...
@@ -486,9 +488,10 @@ class CompilationConfig:
If empty list [], no ops are excluded (suitable for full cudagraphs)."""
If empty list [], no ops are excluded (suitable for full cudagraphs)."""
compile_mm_encoder
:
bool
=
False
compile_mm_encoder
:
bool
=
False
"""Whether or not to compile the multimodal encoder.
"""Whether or not to compile the multimodal encoder.
Currently, this only works for `Qwen2_5_vl` and `mLLaMa4` models
Currently, this only works for `Qwen2_5_vl` and `mLLaMa4` models on selected
on selected platforms. Disabled by default until more models
platforms. It may also work for models loaded with the Transformers modeling backend
are supported/tested to work."""
if the encoder is compilable. Disabled by default until more models are
supported/tested to work."""
# Vision encoder CUDA graph
# Vision encoder CUDA graph
cudagraph_mm_encoder
:
bool
=
False
cudagraph_mm_encoder
:
bool
=
False
...
...
vllm/config/speculative.py
View file @
fc67613a
...
@@ -805,6 +805,8 @@ class SpeculativeConfig:
...
@@ -805,6 +805,8 @@ class SpeculativeConfig:
"deepseek_v3"
,
"deepseek_v3"
,
"kimi_k2"
,
"kimi_k2"
,
"kimi_k25"
,
"kimi_k25"
,
"minimax_m2"
,
"gemma4"
,
]
]
if
(
if
(
self
.
method
in
(
"eagle3"
,
"extract_hidden_states"
)
self
.
method
in
(
"eagle3"
,
"extract_hidden_states"
)
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment