Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
425eb81e
Commit
425eb81e
authored
Feb 24, 2026
by
jujl1
Browse files
Merge branch 'v0.15.1-dev' into 'v0.15.1-dev-w4a8+pp_balance'
# Conflicts: # vllm/envs.py
parents
7b2122d9
358bc2c5
Changes
36
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
2070 additions
and
34 deletions
+2070
-34
vllm/model_executor/layers/quantization/utils/fp8_utils.py
vllm/model_executor/layers/quantization/utils/fp8_utils.py
+72
-5
vllm/model_executor/models/config.py
vllm/model_executor/models/config.py
+27
-0
vllm/model_executor/models/glm4_moe_mtp.py
vllm/model_executor/models/glm4_moe_mtp.py
+2
-0
vllm/model_executor/models/qwen3_5.py
vllm/model_executor/models/qwen3_5.py
+993
-0
vllm/model_executor/models/qwen3_5_mtp.py
vllm/model_executor/models/qwen3_5_mtp.py
+447
-0
vllm/model_executor/models/qwen3_moe.py
vllm/model_executor/models/qwen3_moe.py
+26
-3
vllm/model_executor/models/qwen3_next.py
vllm/model_executor/models/qwen3_next.py
+14
-9
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+10
-0
vllm/renderers/hf.py
vllm/renderers/hf.py
+12
-5
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/__init__.py
+8
-0
vllm/transformers_utils/configs/qwen3_5.py
vllm/transformers_utils/configs/qwen3_5.py
+193
-0
vllm/transformers_utils/configs/qwen3_5_moe.py
vllm/transformers_utils/configs/qwen3_5_moe.py
+205
-0
vllm/transformers_utils/model_arch_config_convertor.py
vllm/transformers_utils/model_arch_config_convertor.py
+6
-0
vllm/utils/__init__.py
vllm/utils/__init__.py
+17
-8
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+2
-0
vllm/v1/worker/gpu_input_batch.py
vllm/v1/worker/gpu_input_batch.py
+36
-4
No files found.
vllm/model_executor/layers/quantization/utils/fp8_utils.py
View file @
425eb81e
...
@@ -47,6 +47,8 @@ from vllm.utils.flashinfer import (
...
@@ -47,6 +47,8 @@ from vllm.utils.flashinfer import (
should_use_flashinfer_for_blockscale_fp8_gemm
,
should_use_flashinfer_for_blockscale_fp8_gemm
,
)
)
from
vllm.utils.torch_utils
import
direct_register_custom_op
from
vllm.utils.torch_utils
import
direct_register_custom_op
from
lmslim
import
quant_ops
from
lmslim.quantize.quant_ops
import
BlockSize
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -357,6 +359,7 @@ class W8A8BlockFp8LinearOp:
...
@@ -357,6 +359,7 @@ class W8A8BlockFp8LinearOp:
act_quant_group_shape
:
GroupShape
,
act_quant_group_shape
:
GroupShape
,
cutlass_block_fp8_supported
:
bool
=
CUTLASS_BLOCK_FP8_SUPPORTED
,
cutlass_block_fp8_supported
:
bool
=
CUTLASS_BLOCK_FP8_SUPPORTED
,
use_aiter_and_is_supported
:
bool
=
False
,
use_aiter_and_is_supported
:
bool
=
False
,
use_blaslt
:
bool
=
False
,
):
):
self
.
weight_group_shape
=
weight_group_shape
self
.
weight_group_shape
=
weight_group_shape
self
.
act_quant_group_shape
=
act_quant_group_shape
self
.
act_quant_group_shape
=
act_quant_group_shape
...
@@ -364,14 +367,13 @@ class W8A8BlockFp8LinearOp:
...
@@ -364,14 +367,13 @@ class W8A8BlockFp8LinearOp:
self
.
is_hopper
=
current_platform
.
is_device_capability
(
90
)
self
.
is_hopper
=
current_platform
.
is_device_capability
(
90
)
self
.
use_deep_gemm_e8m0
=
is_deep_gemm_e8m0_used
()
self
.
use_deep_gemm_e8m0
=
is_deep_gemm_e8m0_used
()
self
.
is_flashinfer_supported
=
is_flashinfer_fp8_blockscale_gemm_supported
()
self
.
is_flashinfer_supported
=
is_flashinfer_fp8_blockscale_gemm_supported
()
# Get the correct blockscale mul and input quant operations.
# Get the correct blockscale mul and input quant operations.
# We can't use _dispatch_w8a8_blockscale_op to figure out if we want
# We can't use _dispatch_w8a8_blockscale_op to figure out if we want
# to use deepgemm because we don't know the shape of weights (and
# to use deepgemm because we don't know the shape of weights (and
# whether deepgemm supports it) at the init time.
# whether deepgemm supports it) at the init time.
self
.
w8a8_blockscale_op
,
self
.
input_quant_op
=
(
self
.
w8a8_blockscale_op
,
self
.
input_quant_op
=
(
self
.
_dispatch_w8a8_blockscale_op
(
self
.
_dispatch_w8a8_blockscale_op
(
cutlass_block_fp8_supported
,
use_aiter_and_is_supported
cutlass_block_fp8_supported
,
use_aiter_and_is_supported
,
use_blaslt
)
)
)
)
self
.
deepgemm_input_quant_op
=
(
self
.
deepgemm_input_quant_op
=
(
...
@@ -397,8 +399,14 @@ class W8A8BlockFp8LinearOp:
...
@@ -397,8 +399,14 @@ class W8A8BlockFp8LinearOp:
assert
input_scale
is
None
assert
input_scale
is
None
# View input as 2D matrix for fp8 methods
# View input as 2D matrix for fp8 methods
input_2d
=
input
.
view
(
-
1
,
input
.
shape
[
-
1
])
input_2d
=
input
.
view
(
-
1
,
input
.
shape
[
-
1
])
output_shape
=
[
*
input
.
shape
[:
-
1
],
weight
.
shape
[
0
]
]
output_shape
=
[]
output_dtype
=
input
.
dtype
output_dtype
=
input
.
dtype
if
envs
.
VLLM_W8A8_BACKEND
==
3
:
output_shape
=
[
*
input
.
shape
[:
-
1
],
weight
.
shape
[
-
1
]]
out_features
=
int
(
weight
.
shape
[
-
1
])
else
:
output_shape
=
[
*
input
.
shape
[:
-
1
],
weight
.
shape
[
0
]]
out_features
=
int
(
weight
.
shape
[
0
])
if
should_use_flashinfer_for_blockscale_fp8_gemm
(
if
should_use_flashinfer_for_blockscale_fp8_gemm
(
self
.
is_flashinfer_supported
,
output_dtype
,
input_2d
,
weight
self
.
is_flashinfer_supported
,
output_dtype
,
input_2d
,
weight
...
@@ -413,7 +421,7 @@ class W8A8BlockFp8LinearOp:
...
@@ -413,7 +421,7 @@ class W8A8BlockFp8LinearOp:
output
=
self
.
_run_deepgemm
(
input_2d
,
weight
,
weight_scale
)
output
=
self
.
_run_deepgemm
(
input_2d
,
weight
,
weight_scale
)
else
:
else
:
output
=
self
.
w8a8_blockscale_op
(
output
=
self
.
w8a8_blockscale_op
(
input_2d
,
weight
,
weight_scale
,
input_scale
out_features
,
input_2d
,
weight
,
weight_scale
,
input_scale
)
)
if
bias
is
not
None
:
if
bias
is
not
None
:
...
@@ -535,6 +543,37 @@ class W8A8BlockFp8LinearOp:
...
@@ -535,6 +543,37 @@ class W8A8BlockFp8LinearOp:
input_2d
.
dtype
,
input_2d
.
dtype
,
)
)
def _run_hipblaslt_blockwise(
    self,
    out_features: int,
    input_2d: torch.Tensor,
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    input_scale: torch.Tensor | None = None,
) -> torch.Tensor:
    """Run the block-scaled FP8 GEMM through the hipBLASLt backend.

    Args:
        out_features: Accepted for signature parity with the dispatcher
            call; this implementation does not use it.
        input_2d: Activations; must be 2-D (enforced by the shape unpack).
        weight: Weight tensor, forwarded unchanged as operand ``B``.
        weight_scale: Weight scales, forwarded unchanged as ``Bs``.
        input_scale: Activation scales. When ``None`` the activations are
            quantized with ``self.input_quant_op``; otherwise ``input_2d``
            is forwarded as-is.
            NOTE(review): presumably ``input_2d`` is already FP8 in that
            case -- confirm against callers.

    Returns:
        The result of ``hipblaslt_w8a8_block_fp8_matmul``, requested with
        ``output_dtype=torch.bfloat16``.
    """
    # Unpacking into exactly two names rejects non-2-D activations early.
    _num_rows, _reduce_dim = input_2d.shape

    if input_scale is None:
        activations, input_scale = self.input_quant_op(input_2d)
    else:
        activations = input_2d

    # 128x128 tiles unless the instance carries a ``block_size`` whose first
    # entry is 64.
    wants_64_tiles = hasattr(self, "block_size") and self.block_size[0] == 64
    tile_shape = (
        BlockSize.block_64x64 if wants_64_tiles else BlockSize.block_128x128
    )

    return hipblaslt_w8a8_block_fp8_matmul(
        A=activations,
        B=weight,
        As=input_scale,
        Bs=weight_scale,
        block_size=tile_shape,
        output_dtype=torch.bfloat16,
    )
def
_run_flashinfer
(
def
_run_flashinfer
(
self
,
self
,
input_2d
:
torch
.
Tensor
,
input_2d
:
torch
.
Tensor
,
...
@@ -562,6 +601,7 @@ class W8A8BlockFp8LinearOp:
...
@@ -562,6 +601,7 @@ class W8A8BlockFp8LinearOp:
self
,
self
,
use_cutlass
:
bool
,
use_cutlass
:
bool
,
use_aiter_and_is_supported
:
bool
,
use_aiter_and_is_supported
:
bool
,
use_blaslt
:
bool
,
)
->
tuple
[
)
->
tuple
[
Callable
[
Callable
[
[
[
...
@@ -585,6 +625,16 @@ class W8A8BlockFp8LinearOp:
...
@@ -585,6 +625,16 @@ class W8A8BlockFp8LinearOp:
)
)
if
use_aiter_and_is_supported
:
if
use_aiter_and_is_supported
:
return
self
.
_run_aiter
,
None
return
self
.
_run_aiter
,
None
if
envs
.
VLLM_W8A8_BACKEND
==
3
or
use_blaslt
:
return
(
self
.
_run_hipblaslt_blockwise
,
QuantFP8
(
False
,
self
.
act_quant_group_shape
,
column_major_scales
=
False
,
use_ue8m0
=
False
,
),
)
return
self
.
_run_triton
,
(
return
self
.
_run_triton
,
(
QuantFP8
(
QuantFP8
(
False
,
False
,
...
@@ -1179,6 +1229,19 @@ def get_w8a8_block_fp8_configs(
...
@@ -1179,6 +1229,19 @@ def get_w8a8_block_fp8_configs(
)
)
return
None
return
None
def hipblaslt_w8a8_block_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: BlockSize,
    output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    """Multiply ``A`` by ``B`` with ``quant_ops.hipblaslt_w8a8_blockwise_gemm``.

    Args:
        A: Left operand; must be 2-D with ``A.shape[1] == B.shape[0]``.
        B: Right operand; must be 2-D.
        As: Scales forwarded alongside ``A``.
        Bs: Scales forwarded alongside ``B``.
        block_size: ``BlockSize`` member forwarded to the kernel.
        output_dtype: Dtype requested from the kernel.

    Returns:
        The second element of the pair returned by the kernel.
    """
    assert A.shape[1] == B.shape[0]

    rows, inner = A.shape
    _, cols = B.shape

    # The kernel returns a pair; only the second entry is the product.
    # NOTE(review): 'NN' presumably selects non-transposed operands -- confirm
    # against the lmslim API.
    _discarded, product = quant_ops.hipblaslt_w8a8_blockwise_gemm(
        A,
        B,
        As,
        Bs,
        rows,
        cols,
        inner,
        'NN',
        output_dtype,
        block_size,
        None,
    )
    return product
def
w8a8_triton_block_scaled_mm
(
def
w8a8_triton_block_scaled_mm
(
A
:
torch
.
Tensor
,
A
:
torch
.
Tensor
,
...
@@ -1597,7 +1660,11 @@ def process_fp8_weight_block_strategy(
...
@@ -1597,7 +1660,11 @@ def process_fp8_weight_block_strategy(
weight
=
weight
,
weight_scale
=
weight_scale
weight
=
weight
,
weight_scale
=
weight_scale
)
)
weight
=
_maybe_pad_fp8_weight
(
weight
)
if
envs
.
VLLM_W8A8_BACKEND
==
3
:
weight
=
weight
.
T
.
contiguous
()
weight_scale
=
weight_scale
.
T
.
contiguous
()
else
:
weight
=
_maybe_pad_fp8_weight
(
weight
)
return
weight
,
weight_scale
return
weight
,
weight_scale
...
...
vllm/model_executor/models/config.py
View file @
425eb81e
...
@@ -581,6 +581,31 @@ class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
...
@@ -581,6 +581,31 @@ class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
)
)
cache_config
.
mamba_ssm_cache_dtype
=
mamba_ssm_cache_dtype
cache_config
.
mamba_ssm_cache_dtype
=
mamba_ssm_cache_dtype
class Qwen3_5ForConditionalGenerationConfig(VerifyAndUpdateConfig):
    """Config hook that reconciles the Mamba SSM cache dtype for Qwen3.5."""

    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        """Align ``cache_config.mamba_ssm_cache_dtype`` with the HF config.

        If the cache dtype is ``"auto"`` and the HF text config carries a
        ``mamba_ssm_dtype``, that value is written into the cache config.
        If the cache dtype is anything else and differs from the HF value,
        the cache dtype is left untouched and a warning is logged. Nothing
        happens when the HF text config has no ``mamba_ssm_dtype``.
        """
        cache_cfg = vllm_config.cache_config
        text_cfg = vllm_config.model_config.hf_text_config
        model_dtype = getattr(text_cfg, "mamba_ssm_dtype", None)
        requested_dtype = cache_cfg.mamba_ssm_cache_dtype

        # Without a model-side preference there is nothing to apply or compare.
        if model_dtype is None:
            return

        if requested_dtype == "auto":
            cache_cfg.mamba_ssm_cache_dtype = model_dtype
            return

        if requested_dtype != model_dtype:
            logger.warning(
                "Qwen3.5 model specifies mamba_ssm_dtype='%s' in its config, "
                "but --mamba-ssm-cache-dtype='%s' was passed. "
                "Using the user-specified value.",
                model_dtype,
                requested_dtype,
            )
MODELS_CONFIG_MAP
:
dict
[
str
,
type
[
VerifyAndUpdateConfig
]]
=
{
MODELS_CONFIG_MAP
:
dict
[
str
,
type
[
VerifyAndUpdateConfig
]]
=
{
"GteModel"
:
SnowflakeGteNewModelConfig
,
"GteModel"
:
SnowflakeGteNewModelConfig
,
...
@@ -603,4 +628,6 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
...
@@ -603,4 +628,6 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"FalconMambaForCausalLM"
:
MambaModelConfig
,
"FalconMambaForCausalLM"
:
MambaModelConfig
,
"DeepseekV32ForCausalLM"
:
DeepseekV32ForCausalLM
,
"DeepseekV32ForCausalLM"
:
DeepseekV32ForCausalLM
,
"NemotronHForCausalLM"
:
NemotronHForCausalLMConfig
,
"NemotronHForCausalLM"
:
NemotronHForCausalLMConfig
,
"Qwen3_5ForConditionalGeneration"
:
Qwen3_5ForConditionalGenerationConfig
,
"Qwen3_5MoeForConditionalGeneration"
:
Qwen3_5ForConditionalGenerationConfig
,
}
}
vllm/model_executor/models/glm4_moe_mtp.py
View file @
425eb81e
...
@@ -49,6 +49,7 @@ from .glm4_moe import (
...
@@ -49,6 +49,7 @@ from .glm4_moe import (
get_spec_layer_idx_from_weight_name
,
get_spec_layer_idx_from_weight_name
,
)
)
from
.utils
import
maybe_prefix
from
.utils
import
maybe_prefix
from
vllm.compilation.decorators
import
support_torch_compile
class
SharedHead
(
nn
.
Module
):
class
SharedHead
(
nn
.
Module
):
...
@@ -184,6 +185,7 @@ class Glm4MoeMultiTokenPredictor(nn.Module):
...
@@ -184,6 +185,7 @@ class Glm4MoeMultiTokenPredictor(nn.Module):
return
logits
return
logits
@
support_torch_compile
class
Glm4MoeMTP
(
nn
.
Module
,
Glm4MixtureOfExperts
):
class
Glm4MoeMTP
(
nn
.
Module
,
Glm4MixtureOfExperts
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
...
...
vllm/model_executor/models/qwen3_5.py
0 → 100644
View file @
425eb81e
This diff is collapsed.
Click to expand it.
vllm/model_executor/models/qwen3_5_mtp.py
0 → 100644
View file @
425eb81e
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Inference-only Qwen3_5 MTP model."""
import
typing
from
collections.abc
import
Callable
,
Iterable
import
torch
from
torch
import
nn
from
transformers.models.qwen3_5.configuration_qwen3_5
import
Qwen3_5TextConfig
from
transformers.models.qwen3_5_moe.configuration_qwen3_5_moe
import
(
Qwen3_5MoeTextConfig
,
)
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
VllmConfig
from
vllm.distributed.parallel_state
import
get_pp_group
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.linear
import
ColumnParallelLinear
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
,
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.qwen3_5
import
Qwen3_5DecoderLayer
,
Qwen3_5RMSNorm
from
vllm.model_executor.models.qwen3_next
import
QwenNextMixtureOfExperts
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
(
MultiModalEmbeddings
,
SupportsMultiModal
,
_require_is_multimodal
,
)
from
.utils
import
(
AutoWeightsLoader
,
PPMissingLayer
,
_merge_multimodal_embeddings
,
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
maybe_prefix
,
)
logger
=
init_logger
(
__name__
)
@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
        # otherwise (seq_len, ).
        "positions": -1,
        "intermediate_tensors": 0,
        "inputs_embeds": 0,
        "hidden_states": 0,
    }
)
class Qwen3_5MultiTokenPredictor(nn.Module):
    """Multi-token-prediction (MTP) body for Qwen3.5.

    Combines normalized token embeddings with normalized incoming hidden
    states through the ``fc`` projection, then runs one of
    ``num_mtp_layers`` decoder layers selected by the speculative step index.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the embedding, fusion projection, decoder layers and norms.

        Args:
            vllm_config: Source of the HF text config and quantization config.
            prefix: Module-name prefix used for the ``fc`` and ``layers.N``
                sub-module prefixes.
        """
        super().__init__()

        model_config = vllm_config.model_config
        quant_config = vllm_config.quant_config

        config: Qwen3_5TextConfig | Qwen3_5MoeTextConfig = model_config.hf_text_config

        self.config = config

        self.vocab_size = config.vocab_size

        # Stored for reference; not read anywhere else in this class.
        self.mtp_start_layer_idx = config.num_hidden_layers
        # Falls back to a single MTP layer when the config omits the field.
        self.num_mtp_layers = getattr(config, "mtp_num_hidden_layers", 1)

        self.embed_tokens = VocabParallelEmbedding(
            self.vocab_size,
            config.hidden_size,
        )

        # Maps the concatenated [embedding, hidden] vector (2 * hidden_size)
        # back down to hidden_size.
        self.fc = ColumnParallelLinear(
            self.config.hidden_size * 2,
            self.config.hidden_size,
            gather_output=True,
            bias=False,
            return_bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.fc",
        )

        # Every MTP layer is created with layer_type="full_attention".
        self.layers = torch.nn.ModuleList(
            Qwen3_5DecoderLayer(
                vllm_config,
                layer_type="full_attention",
                prefix=f"{prefix}.layers.{idx}",
            )
            for idx in range(self.num_mtp_layers)
        )

        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
            ["hidden_states", "residual"], config.hidden_size
        )

        self.norm = Qwen3_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.pre_fc_norm_hidden = Qwen3_5RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        self.pre_fc_norm_embedding = Qwen3_5RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return ``self.embed_tokens(input_ids)``."""
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        """Run one MTP step.

        Args:
            input_ids: Token ids; only used on the first pipeline rank when
                ``inputs_embeds`` is ``None``.
            positions: Forwarded unchanged to the selected decoder layer.
            hidden_states: Hidden states to fuse with the embeddings; its last
                dimension must equal that of the embeddings.
            intermediate_tensors: Required on non-first pipeline ranks; must
                provide ``"hidden_states"`` and ``"residual"``.
            inputs_embeds: Optional precomputed embeddings.
            spec_step_idx: Speculative step; the layer used is
                ``spec_step_idx % num_mtp_layers``.

        Returns:
            On the last pipeline rank, the first output of ``self.norm``.
            On other ranks, an ``IntermediateTensors`` holding
            ``hidden_states`` and ``residual`` (despite the annotation).
        """
        if get_pp_group().is_first_rank:
            if inputs_embeds is None:
                inputs_embeds = self.embed_input_ids(input_ids)
            assert hidden_states.shape[-1] == inputs_embeds.shape[-1]
            inputs_embeds = self.pre_fc_norm_embedding(inputs_embeds)
            hidden_states = self.pre_fc_norm_hidden(hidden_states)
            # Embedding goes first, incoming hidden state second.
            hidden_states = torch.cat([inputs_embeds, hidden_states], dim=-1)
            hidden_states = self.fc(hidden_states)
            residual = None
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]

        # Wrap around so any step index maps onto an existing layer.
        current_step_idx = spec_step_idx % self.num_mtp_layers
        hidden_states, residual = self.layers[current_step_idx](
            positions=positions,
            hidden_states=hidden_states,
            residual=residual,
        )

        if not get_pp_group().is_last_rank:
            return IntermediateTensors(
                {"hidden_states": hidden_states, "residual": residual}
            )

        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states

    def load_fused_expert_weights(
        self,
        name: str,
        params_dict: dict,
        loaded_weight: torch.Tensor,
        shard_id: str,
        num_experts: int,
    ) -> bool:
        """Load a fused expert tensor one expert at a time.

        ``loaded_weight`` is indexed along its first dimension by expert id,
        and each slice is passed to the parameter's ``weight_loader``.

        Args:
            name: Key of the destination parameter in ``params_dict``.
            params_dict: Mapping from parameter name to parameter.
            loaded_weight: Checkpoint tensor indexed by expert id.
            shard_id: Shard identifier forwarded to the weight loader.
            num_experts: Number of leading slices to iterate over.

        Returns:
            ``True`` if the loader reported success for at least one expert.
        """
        param = params_dict[name]
        weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
        loaded_local_expert = False
        for expert_id in range(num_experts):
            curr_expert_weight = loaded_weight[expert_id]
            success = weight_loader(
                param,
                curr_expert_weight,
                name,
                shard_id,
                expert_id,
                return_success=True,
            )
            if success:
                loaded_local_expert = True

        return loaded_local_expert

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load checkpoint tensors into this module's parameters.

        Each ``(name, tensor)`` pair is tried, in order, against the stacked
        (fused qkv / gate_up) mapping, then the expert mapping, then a direct
        name lookup.

        Args:
            weights: Iterable of ``(checkpoint_name, tensor)`` pairs.

        Returns:
            The set of names recorded via ``loaded_params.add``.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]

        # Params for weights, fp8 weight scales, fp8 activation scales
        # (param_name, weight_name, expert_id, shard_id)
        expert_params_mapping = FusedMoE.make_expert_params_mapping(
            self,
            ckpt_gate_proj_name="gate_proj",
            ckpt_down_proj_name="down_proj",
            ckpt_up_proj_name="up_proj",
            num_experts=self.config.num_experts
            if hasattr(self.config, "num_experts")
            else 0,
        )

        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        # Once a fused-expert tensor name is seen this flag stays True for the
        # remainder of the iteration, as does the swapped expert mapping.
        is_fused_expert = False
        fused_expert_params_mapping = [
            ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"),
            ("experts.w2_weight", "experts.down_proj", 0, "w2"),
        ]
        # Configs without a num_experts attribute are treated as 0 experts.
        num_experts = (
            self.config.num_experts if hasattr(self.config, "num_experts") else 0
        )
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue

            for param_name, weight_name, shard_id in stacked_params_mapping:
                if "experts.gate_up_proj" in name or "experts.down_proj" in name:
                    is_fused_expert = True
                    expert_params_mapping = fused_expert_params_mapping

                if weight_name not in name:
                    continue

                # Expert weights are handled by the expert mapping below.
                if "mlp.experts" in name:
                    continue

                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                # Skip layers on other devices.
                if is_pp_missing_parameter(name, self):
                    continue
                if name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # No stacked mapping consumed the tensor (the loop above did
                # not break); try the expert mapping next.
                is_expert_weight = False
                for mapping in expert_params_mapping:
                    param_name, weight_name, expert_id, shard_id = mapping
                    if weight_name not in name:
                        continue
                    is_expert_weight = True
                    name_mapped = name.replace(weight_name, param_name)
                    # Skip layers on other devices.
                    if is_pp_missing_parameter(name_mapped, self):
                        continue
                    if is_fused_expert:
                        # qwen3.5 no need to transpose
                        # loaded_weight = loaded_weight.transpose(-1, -2)
                        if "experts.gate_up_proj" in name:
                            # Split into two halves along dim -2: the first is
                            # loaded as "w1", the second as "w3". This rebinds
                            # loaded_weight to the tuple returned by chunk().
                            loaded_weight = loaded_weight.chunk(2, dim=-2)
                            success_w1 = self.load_fused_expert_weights(
                                name_mapped,
                                params_dict,
                                loaded_weight[0],
                                "w1",
                                num_experts,
                            )
                            success_w3 = self.load_fused_expert_weights(
                                name_mapped,
                                params_dict,
                                loaded_weight[1],
                                "w3",
                                num_experts,
                            )
                            success = success_w1 and success_w3
                        else:
                            # down_proj
                            success = self.load_fused_expert_weights(
                                name_mapped,
                                params_dict,
                                loaded_weight,
                                shard_id,
                                num_experts,
                            )
                        if success:
                            name = name_mapped
                            break
                    else:
                        # Skip loading extra bias for GPTQ models.
                        if (
                            name_mapped.endswith(".bias")
                            or name_mapped.endswith("_bias")
                        ) and name_mapped not in params_dict:
                            continue
                        # NOTE(review): a non-bias name missing from
                        # params_dict raises KeyError here.
                        param = params_dict[name_mapped]
                        weight_loader = param.weight_loader
                        success = weight_loader(
                            param,
                            loaded_weight,
                            name_mapped,
                            shard_id=shard_id,
                            expert_id=expert_id,
                            return_success=True,
                        )
                    if success:
                        name = name_mapped
                        break
                else:
                    if is_expert_weight:
                        # We've checked that this is an expert weight
                        # However it's not mapped locally to this rank
                        # So we simply skip it
                        continue
                    # Skip loading extra bias for GPTQ models.
                    if name.endswith(".bias") and name not in params_dict:
                        continue
                    if is_pp_missing_parameter(name, self):
                        continue
                    if name not in params_dict:
                        logger.warning_once(
                            f"Parameter {name} not found in params_dict, skip loading"
                        )
                        continue
                    param = params_dict[name]
                    # Parameters without a custom loader use the default one.
                    weight_loader = getattr(
                        param, "weight_loader", default_weight_loader
                    )
                    weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params
@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
        # otherwise (seq_len, ).
        "positions": -1,
        "intermediate_tensors": 0,
        "inputs_embeds": 0,
        "hidden_states": 0,
    }
)
class Qwen3_5MTP(nn.Module, SupportsMultiModal):
    """Qwen3.5 multi-token-prediction model: MTP body plus LM head."""

    # Checkpoint projections that are fused into a single vLLM parameter.
    # gate_up_proj packs gate_proj (shard 0) and up_proj (shard 1), matching
    # the stacked-parameter mapping used when loading the predictor weights.
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the predictor, LM head and logits processor.

        Args:
            vllm_config: Source of the HF text config, cache config and
                quantization config.
            prefix: Module-name prefix for sub-modules.

        Raises:
            NotImplementedError: If ``cache_config.mamba_cache_mode`` is
                ``"all"``.
        """
        config = vllm_config.model_config.hf_text_config
        self.vllm_config = vllm_config
        cache_config = vllm_config.cache_config
        if cache_config.mamba_cache_mode == "all":
            raise NotImplementedError(
                "Qwen3_5MTP currently does not support 'all' prefix caching, "
                "please use '--mamba-cache-mode=align' instead"
            )

        self.quant_config = vllm_config.quant_config

        super().__init__()
        self.config = config
        self.model = Qwen3_5MultiTokenPredictor(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "mtp")
        )

        if get_pp_group().is_last_rank:
            if config.tie_word_embeddings:
                # Tied embeddings: reuse the predictor's embedding as the head.
                self.lm_head = self.model.embed_tokens
            else:
                self.lm_head = ParallelLMHead(
                    config.vocab_size,
                    config.hidden_size,
                    prefix=maybe_prefix(prefix, "lm_head"),
                )
        else:
            # Only the last pipeline rank owns a real LM head.
            self.lm_head = PPMissingLayer()

        self.logits_processor = LogitsProcessor(config.vocab_size)

    def embed_input_ids(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: MultiModalEmbeddings | None = None,
        *,
        is_multimodal: torch.Tensor | None = None,
        handle_oov_mm_token: bool = False,
    ) -> torch.Tensor:
        """Embed token ids and optionally merge multimodal embeddings.

        Args:
            input_ids: Token ids to embed.
            multimodal_embeddings: Embeddings to merge in; ``None`` or an
                empty container returns the text embeddings unchanged.
            is_multimodal: Mask forwarded to the text-embedding helper and,
                when merging, validated by ``_require_is_multimodal``.
            handle_oov_mm_token: Forwarded to the text-embedding helper.

        Returns:
            The text embeddings, merged with ``multimodal_embeddings`` when
            any are supplied.
        """
        inputs_embeds = self._embed_text_input_ids(
            input_ids,
            self.model.embed_input_ids,
            is_multimodal=is_multimodal,
            handle_oov_mm_token=handle_oov_mm_token,
        )

        # len() rather than truthiness: the container may be a tensor.
        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
            return inputs_embeds

        is_multimodal = _require_is_multimodal(is_multimodal)

        inputs_embeds = _merge_multimodal_embeddings(
            inputs_embeds=inputs_embeds,
            multimodal_embeddings=multimodal_embeddings,
            is_multimodal=is_multimodal,
        )

        return inputs_embeds

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: object,
    ):
        """Delegate to the predictor and return its output.

        Extra keyword arguments are accepted but not forwarded, so the
        predictor always runs with its default ``spec_step_idx`` of 0.
        """
        hidden_states = self.model(
            input_ids, positions, hidden_states, intermediate_tensors, inputs_embeds
        )
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        spec_step_idx: int = 0,
    ) -> torch.Tensor | None:
        """Return ``self.logits_processor(self.lm_head, hidden_states)``.

        ``spec_step_idx`` is accepted for interface compatibility and unused.
        """
        return self.logits_processor(self.lm_head, hidden_states)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load the MTP-relevant subset of a checkpoint.

        Tensors named ``mtp.*`` are renamed to ``model.*``. Tensors whose
        name contains ``embed_tokens`` or ``lm_head`` are kept, with
        ``language_model.`` stripped from ``embed_tokens`` names. All other
        tensors are dropped. The result is handed to ``AutoWeightsLoader``.

        Args:
            weights: Iterable of ``(checkpoint_name, tensor)`` pairs.

        Returns:
            Whatever ``AutoWeightsLoader.load_weights`` returns.
        """

        def remap_weight_names(weights):
            for name, weight in weights:
                if name.startswith("mtp."):
                    # Rewrite only the leading prefix; a later "mtp." inside
                    # the name must be left untouched.
                    name = name.replace("mtp.", "model.", 1)
                elif any(key in name for key in ["embed_tokens", "lm_head"]):
                    if "embed_tokens" in name:
                        name = name.replace("language_model.", "")
                else:
                    continue
                yield name, weight

        loader = AutoWeightsLoader(self)
        return loader.load_weights(remap_weight_names(weights))
class Qwen3_5MoeMTP(Qwen3_5MTP, QwenNextMixtureOfExperts):
    """Mixture-of-experts variant of ``Qwen3_5MTP``."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the base MTP model, then call ``set_moe_parameters``."""
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        # Must run after the base constructor has finished.
        # NOTE(review): presumably because it inspects the built layers --
        # confirm against QwenNextMixtureOfExperts.
        self.set_moe_parameters()
\ No newline at end of file
vllm/model_executor/models/qwen3_moe.py
View file @
425eb81e
...
@@ -46,6 +46,19 @@ from vllm.distributed import (
...
@@ -46,6 +46,19 @@ from vllm.distributed import (
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
SharedFusedMoE
from
vllm.model_executor.layers.fused_moe
import
SharedFusedMoE
# Router-logit capture is optional: fall back to a no-op with the same
# keyword-only signature when the capture module cannot be imported.
try:
    from vllm.model_executor.layers.fused_moe.router_capture import (
        maybe_record_router_logits,
    )
except ImportError:

    def maybe_record_router_logits(
        *,
        layer_name: str,
        router_logits: torch.Tensor,
        top_k: int,
    ) -> None:
        """Do nothing; stand-in used when router capture is unavailable."""
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -152,6 +165,8 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
...
@@ -152,6 +165,8 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
self
.
n_routed_experts
=
config
.
num_experts
self
.
n_routed_experts
=
config
.
num_experts
self
.
is_sequence_parallel
=
parallel_config
.
use_sequence_parallel_moe
self
.
is_sequence_parallel
=
parallel_config
.
use_sequence_parallel_moe
self
.
_router_top_k
=
int
(
config
.
num_experts_per_tok
)
self
.
_router_capture_layer_name
=
prefix
if
self
.
tp_size
>
config
.
num_experts
:
if
self
.
tp_size
>
config
.
num_experts
:
raise
ValueError
(
raise
ValueError
(
...
@@ -235,6 +250,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
...
@@ -235,6 +250,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
# router_logits: (num_tokens, n_experts)
# router_logits: (num_tokens, n_experts)
router_logits
,
_
=
self
.
gate
(
hidden_states
)
router_logits
,
_
=
self
.
gate
(
hidden_states
)
if
not
(
hasattr
(
torch
,
"compiler"
)
and
torch
.
compiler
.
is_compiling
()):
capture_enabled
=
envs
.
VLLM_MOE_ROUTER_CAPTURE
if
capture_enabled
:
maybe_record_router_logits
(
layer_name
=
self
.
_router_capture_layer_name
,
router_logits
=
router_logits
,
top_k
=
self
.
_router_top_k
,
)
shared_out
,
fused_out
=
self
.
experts
(
shared_out
,
fused_out
=
self
.
experts
(
hidden_states
=
hidden_states
,
router_logits
=
router_logits
hidden_states
=
hidden_states
,
router_logits
=
router_logits
)
)
...
@@ -341,13 +364,13 @@ class Qwen3MoeAttention(nn.Module):
...
@@ -341,13 +364,13 @@ class Qwen3MoeAttention(nn.Module):
def
rms_rotary_embedding_fuse
(
def
rms_rotary_embedding_fuse
(
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
query
:
torch
.
Tensor
,
query
:
torch
.
Tensor
,
key
:
torch
.
Tensor
|
None
,
head_size
:
int
,
head_size
:
int
,
cos_sin_cache
:
torch
.
Tensor
,
cos_sin_cache
:
torch
.
Tensor
,
is_neox_style
:
bool
,
is_neox_style
:
bool
,
q_weight
:
torch
.
Tensor
,
q_weight
:
torch
.
Tensor
,
k_weight
:
torch
.
Tensor
,
k_weight
:
torch
.
Tensor
,
epsilon
:
float
,
epsilon
:
float
,
key
:
torch
.
Tensor
|
None
=
None
,
q_bias
:
torch
.
Tensor
|
None
=
None
,
q_bias
:
torch
.
Tensor
|
None
=
None
,
k_bias
:
torch
.
Tensor
|
None
=
None
,
k_bias
:
torch
.
Tensor
|
None
=
None
,
)
->
None
:
)
->
None
:
...
@@ -371,13 +394,13 @@ class Qwen3MoeAttention(nn.Module):
...
@@ -371,13 +394,13 @@ class Qwen3MoeAttention(nn.Module):
# k_out:torch.Tensor,
# k_out:torch.Tensor,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
query
:
torch
.
Tensor
,
query
:
torch
.
Tensor
,
key
:
torch
.
Tensor
|
None
,
head_size
:
int
,
head_size
:
int
,
cos_sin_cache
:
torch
.
Tensor
,
cos_sin_cache
:
torch
.
Tensor
,
is_neox_style
:
bool
,
is_neox_style
:
bool
,
q_weight
:
torch
.
Tensor
,
q_weight
:
torch
.
Tensor
,
k_weight
:
torch
.
Tensor
,
k_weight
:
torch
.
Tensor
,
epsilon
:
float
,
epsilon
:
float
,
key
:
torch
.
Tensor
|
None
=
None
,
q_bias
:
torch
.
Tensor
|
None
=
None
,
q_bias
:
torch
.
Tensor
|
None
=
None
,
k_bias
:
torch
.
Tensor
|
None
=
None
,
k_bias
:
torch
.
Tensor
|
None
=
None
,
)
->
None
:
)
->
None
:
...
@@ -485,9 +508,9 @@ class Qwen3MoeAttention(nn.Module):
...
@@ -485,9 +508,9 @@ class Qwen3MoeAttention(nn.Module):
self
.
rotary_emb
.
is_neox_style
,
self
.
rotary_emb
.
is_neox_style
,
self
.
q_norm
.
weight
,
self
.
q_norm
.
weight
,
self
.
k_norm
.
weight
,
self
.
k_norm
.
weight
,
self
.
q_norm
.
variance_epsilon
,
None
,
None
,
None
,
None
,
self
.
q_norm
.
variance_epsilon
,
)
)
elif
envs
.
VLLM_USE_FUSED_RMS_ROPE
and
positions
.
ndim
==
2
and
getattr
(
elif
envs
.
VLLM_USE_FUSED_RMS_ROPE
and
positions
.
ndim
==
2
and
getattr
(
self
.
rotary_emb
,
"mrope_section"
,
None
)
is
not
None
:
self
.
rotary_emb
,
"mrope_section"
,
None
)
is
not
None
:
...
...
vllm/model_executor/models/qwen3_next.py
View file @
425eb81e
...
@@ -95,6 +95,7 @@ from .utils import (
...
@@ -95,6 +95,7 @@ from .utils import (
make_layers
,
make_layers
,
maybe_prefix
,
maybe_prefix
,
)
)
import
vllm.envs
as
envs
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -105,7 +106,7 @@ class Qwen3NextSparseMoeBlock(nn.Module):
...
@@ -105,7 +106,7 @@ class Qwen3NextSparseMoeBlock(nn.Module):
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
config
=
vllm_config
.
model_config
.
hf_
text_
config
parallel_config
=
vllm_config
.
parallel_config
parallel_config
=
vllm_config
.
parallel_config
quant_config
=
vllm_config
.
quant_config
quant_config
=
vllm_config
.
quant_config
...
@@ -176,7 +177,7 @@ class Qwen3NextSparseMoeBlock(nn.Module):
...
@@ -176,7 +177,7 @@ class Qwen3NextSparseMoeBlock(nn.Module):
hidden_size
=
config
.
hidden_size
,
hidden_size
=
config
.
hidden_size
,
intermediate_size
=
config
.
moe_intermediate_size
,
intermediate_size
=
config
.
moe_intermediate_size
,
reduce_results
=
False
,
reduce_results
=
False
,
renormalize
=
config
.
norm_topk_prob
,
renormalize
=
getattr
(
config
,
"
norm_topk_prob
"
,
True
)
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.experts"
,
prefix
=
f
"
{
prefix
}
.experts"
,
enable_eplb
=
self
.
enable_eplb
,
enable_eplb
=
self
.
enable_eplb
,
...
@@ -533,9 +534,13 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
...
@@ -533,9 +534,13 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
a
=
a
[:
num_actual_tokens
]
a
=
a
[:
num_actual_tokens
]
# 1. Convolution sequence transformation
# 1. Convolution sequence transformation
conv_weights
=
self
.
conv1d
.
weight
.
view
(
if
envs
.
VLLM_USE_NN
:
self
.
conv1d
.
weight
.
size
(
0
),
self
.
conv1d
.
weight
.
size
(
2
)
conv_weights
=
self
.
conv1d
.
weight
.
squeeze
(
1
).
transpose
(
)
0
,
1
).
contiguous
()
else
:
conv_weights
=
self
.
conv1d
.
weight
.
view
(
self
.
conv1d
.
weight
.
size
(
0
),
self
.
conv1d
.
weight
.
size
(
2
)
)
if
spec_sequence_masks
is
not
None
:
if
spec_sequence_masks
is
not
None
:
if
attn_metadata
.
num_prefills
==
0
and
attn_metadata
.
num_decodes
==
0
:
if
attn_metadata
.
num_prefills
==
0
and
attn_metadata
.
num_decodes
==
0
:
...
@@ -965,7 +970,7 @@ class Qwen3NextModel(nn.Module):
...
@@ -965,7 +970,7 @@ class Qwen3NextModel(nn.Module):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
config
:
Qwen3NextConfig
=
vllm_config
.
model_config
.
hf_config
config
:
Qwen3NextConfig
=
vllm_config
.
model_config
.
hf_
text_
config
parallel_config
=
vllm_config
.
parallel_config
parallel_config
=
vllm_config
.
parallel_config
eplb_config
=
parallel_config
.
eplb_config
eplb_config
=
parallel_config
.
eplb_config
...
@@ -1042,7 +1047,7 @@ class Qwen3NextModel(nn.Module):
...
@@ -1042,7 +1047,7 @@ class Qwen3NextModel(nn.Module):
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_gate_proj_name
=
"gate_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_down_proj_name
=
"down_proj"
,
ckpt_up_proj_name
=
"up_proj"
,
ckpt_up_proj_name
=
"up_proj"
,
num_experts
=
self
.
config
.
num_experts
,
num_experts
=
getattr
(
self
.
config
,
"
num_experts
"
,
0
)
,
num_redundant_experts
=
self
.
num_redundant_experts
,
num_redundant_experts
=
self
.
num_redundant_experts
,
)
)
...
@@ -1201,7 +1206,7 @@ class Qwen3NextForCausalLM(
...
@@ -1201,7 +1206,7 @@ class Qwen3NextForCausalLM(
}
}
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
config
=
vllm_config
.
model_config
.
hf_config
config
=
vllm_config
.
model_config
.
hf_
text_
config
self
.
vllm_config
=
vllm_config
self
.
vllm_config
=
vllm_config
self
.
model_config
=
vllm_config
.
model_config
self
.
model_config
=
vllm_config
.
model_config
cache_config
=
vllm_config
.
cache_config
cache_config
=
vllm_config
.
cache_config
...
@@ -1265,7 +1270,7 @@ class Qwen3NextForCausalLM(
...
@@ -1265,7 +1270,7 @@ class Qwen3NextForCausalLM(
cls
,
vllm_config
:
"VllmConfig"
cls
,
vllm_config
:
"VllmConfig"
)
->
tuple
[
tuple
[
int
,
int
],
tuple
[
int
,
int
]]:
)
->
tuple
[
tuple
[
int
,
int
],
tuple
[
int
,
int
]]:
parallel_config
=
vllm_config
.
parallel_config
parallel_config
=
vllm_config
.
parallel_config
hf_config
=
vllm_config
.
model_config
.
hf_config
hf_config
=
vllm_config
.
model_config
.
hf_
text_
config
tp_size
=
parallel_config
.
tensor_parallel_size
tp_size
=
parallel_config
.
tensor_parallel_size
num_spec
=
(
num_spec
=
(
vllm_config
.
speculative_config
.
num_speculative_tokens
vllm_config
.
speculative_config
.
num_speculative_tokens
...
...
vllm/model_executor/models/registry.py
View file @
425eb81e
...
@@ -438,6 +438,14 @@ _MULTIMODAL_MODELS = {
...
@@ -438,6 +438,14 @@ _MULTIMODAL_MODELS = {
"qwen3_vl_moe"
,
"qwen3_vl_moe"
,
"Qwen3VLMoeForConditionalGeneration"
,
"Qwen3VLMoeForConditionalGeneration"
,
),
),
"Qwen3_5ForConditionalGeneration"
:
(
"qwen3_5"
,
"Qwen3_5ForConditionalGeneration"
,
),
"Qwen3_5MoeForConditionalGeneration"
:
(
"qwen3_5"
,
"Qwen3_5MoeForConditionalGeneration"
,
),
"SkyworkR1VChatModel"
:
(
"skyworkr1v"
,
"SkyworkR1VChatModel"
),
"SkyworkR1VChatModel"
:
(
"skyworkr1v"
,
"SkyworkR1VChatModel"
),
"Step3VLForConditionalGeneration"
:
(
"step3_vl"
,
"Step3VLForConditionalGeneration"
),
# noqa: E501
"Step3VLForConditionalGeneration"
:
(
"step3_vl"
,
"Step3VLForConditionalGeneration"
),
# noqa: E501
"TarsierForConditionalGeneration"
:
(
"tarsier"
,
"TarsierForConditionalGeneration"
),
# noqa: E501
"TarsierForConditionalGeneration"
:
(
"tarsier"
,
"TarsierForConditionalGeneration"
),
# noqa: E501
...
@@ -480,6 +488,8 @@ _SPECULATIVE_DECODING_MODELS = {
...
@@ -480,6 +488,8 @@ _SPECULATIVE_DECODING_MODELS = {
"OpenPanguMTPModel"
:
(
"openpangu_mtp"
,
"OpenPanguMTP"
),
"OpenPanguMTPModel"
:
(
"openpangu_mtp"
,
"OpenPanguMTP"
),
"Qwen3NextMTP"
:
(
"qwen3_next_mtp"
,
"Qwen3NextMTP"
),
"Qwen3NextMTP"
:
(
"qwen3_next_mtp"
,
"Qwen3NextMTP"
),
"Step3p5MTP"
:
(
"step3p5_mtp"
,
"Step3p5MTP"
),
"Step3p5MTP"
:
(
"step3p5_mtp"
,
"Step3p5MTP"
),
"Qwen3_5MTP"
:
(
"qwen3_5_mtp"
,
"Qwen3_5MTP"
),
"Qwen3_5MoeMTP"
:
(
"qwen3_5_mtp"
,
"Qwen3_5MoeMTP"
),
# Temporarily disabled.
# Temporarily disabled.
# # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
# # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
# "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
# "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
...
...
vllm/renderers/hf.py
View file @
425eb81e
...
@@ -35,6 +35,7 @@ from vllm.transformers_utils.processor import cached_get_processor
...
@@ -35,6 +35,7 @@ from vllm.transformers_utils.processor import cached_get_processor
from
vllm.utils.func_utils
import
supports_kw
from
vllm.utils.func_utils
import
supports_kw
from
.protocol
import
RendererLike
from
.protocol
import
RendererLike
import
vllm.envs
as
envs
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -448,11 +449,17 @@ def safe_apply_chat_template(
...
@@ -448,11 +449,17 @@ def safe_apply_chat_template(
model_config
=
model_config
,
model_config
=
model_config
,
)
)
if
chat_template
is
None
:
if
chat_template
is
None
:
raise
ChatTemplateResolutionError
(
if
envs
.
VLLM_USE_V32_ENCODE
:
"As of transformers v4.44, default chat template is no longer "
from
vllm.entrypoints.encoding_dsv32
import
encode_messages
"allowed, so you must provide a chat template if the tokenizer "
encode_config
=
dict
(
thinking_mode
=
"thinking"
,
drop_thinking
=
True
,
add_default_bos_token
=
True
)
"does not define one."
prompt
=
encode_messages
(
conversation
,
**
encode_config
)
)
return
tokenizer
.
encode
(
prompt
)
else
:
raise
ChatTemplateResolutionError
(
"As of transformers v4.44, default chat template is no longer "
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."
)
resolved_kwargs
=
resolve_chat_template_kwargs
(
resolved_kwargs
=
resolve_chat_template_kwargs
(
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
...
...
vllm/transformers_utils/configs/__init__.py
View file @
425eb81e
...
@@ -53,6 +53,10 @@ _CLASS_TO_MODULE: dict[str, str] = {
...
@@ -53,6 +53,10 @@ _CLASS_TO_MODULE: dict[str, str] = {
"Step3p5Config"
:
"vllm.transformers_utils.configs.step3p5"
,
"Step3p5Config"
:
"vllm.transformers_utils.configs.step3p5"
,
"Qwen3ASRConfig"
:
"vllm.transformers_utils.configs.qwen3_asr"
,
"Qwen3ASRConfig"
:
"vllm.transformers_utils.configs.qwen3_asr"
,
"Qwen3NextConfig"
:
"vllm.transformers_utils.configs.qwen3_next"
,
"Qwen3NextConfig"
:
"vllm.transformers_utils.configs.qwen3_next"
,
"Qwen3_5Config"
:
"vllm.transformers_utils.configs.qwen3_5"
,
"Qwen3_5TextConfig"
:
"vllm.transformers_utils.configs.qwen3_5"
,
"Qwen3_5MoeConfig"
:
"vllm.transformers_utils.configs.qwen3_5_moe"
,
"Qwen3_5MoeTextConfig"
:
"vllm.transformers_utils.configs.qwen3_5_moe"
,
"Tarsier2Config"
:
"vllm.transformers_utils.configs.tarsier2"
,
"Tarsier2Config"
:
"vllm.transformers_utils.configs.tarsier2"
,
# Special case: DeepseekV3Config is from HuggingFace Transformers
# Special case: DeepseekV3Config is from HuggingFace Transformers
"DeepseekV3Config"
:
"transformers"
,
"DeepseekV3Config"
:
"transformers"
,
...
@@ -95,6 +99,10 @@ __all__ = [
...
@@ -95,6 +99,10 @@ __all__ = [
"Step3p5Config"
,
"Step3p5Config"
,
"Qwen3ASRConfig"
,
"Qwen3ASRConfig"
,
"Qwen3NextConfig"
,
"Qwen3NextConfig"
,
"Qwen3_5Config"
,
"Qwen3_5TextConfig"
,
"Qwen3_5MoeConfig"
,
"Qwen3_5MoeTextConfig"
,
"Tarsier2Config"
,
"Tarsier2Config"
,
]
]
...
...
vllm/transformers_utils/configs/qwen3_5.py
0 → 100644
View file @
425eb81e
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen3.5 model configuration"""
from
transformers.configuration_utils
import
PretrainedConfig
,
layer_type_validation
class Qwen3_5TextConfig(PretrainedConfig):
    """Hyperparameter container for the Qwen3.5 text model.

    Every constructor argument is stored as an attribute of the same name.
    When ``layer_types`` is not given, a per-layer list is derived in which
    every ``full_attention_interval``-th layer (default 4, read from
    ``kwargs``) is ``"full_attention"`` and all others are
    ``"linear_attention"``.
    """

    model_type = "qwen3_5_text"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=248320,
        hidden_size=4096,
        intermediate_size=12288,
        num_hidden_layers=32,
        num_attention_heads=16,
        num_key_value_heads=4,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_parameters=None,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
        linear_conv_kernel_dim=4,
        linear_key_head_dim=128,
        linear_value_head_dim=128,
        linear_num_key_heads=16,
        linear_num_value_heads=32,
        layer_types=None,
        pad_token_id=None,
        bos_token_id=None,
        eos_token_id=None,
        **kwargs,
    ):
        # Always forwarded to the base class; any caller-supplied value for
        # this key is replaced.
        kwargs["ignore_keys_at_rope_validation"] = [
            "mrope_section",
            "mrope_interleaved",
        ]

        # Core transformer dimensions and behaviour flags.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim
        self.rope_parameters = rope_parameters
        # Only applies when the caller did not provide a rotary factor.
        kwargs.setdefault("partial_rotary_factor", 0.25)

        if layer_types is None:
            # Derive the layer pattern: layer i (0-based) uses full attention
            # exactly when (i + 1) is a multiple of the interval.
            interval = kwargs.get("full_attention_interval", 4)
            layer_types = []
            for layer_idx in range(num_hidden_layers):
                if (layer_idx + 1) % interval != 0:
                    layer_types.append("linear_attention")
                else:
                    layer_types.append("full_attention")
        self.layer_types = layer_types
        layer_type_validation(self.layer_types, self.num_hidden_layers)

        # Linear-attention hyperparameters.
        self.linear_conv_kernel_dim = linear_conv_kernel_dim
        self.linear_key_head_dim = linear_key_head_dim
        self.linear_value_head_dim = linear_value_head_dim
        self.linear_num_key_heads = linear_num_key_heads
        self.linear_num_value_heads = linear_num_value_heads

        super().__init__(**kwargs)

        # Assigned only after the base initialiser has run: transformers v4's
        # PretrainedConfig.__init__ takes these as explicit parameters with
        # its own defaults (e.g. tie_word_embeddings=True) and would otherwise
        # overwrite the values chosen here.
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.tie_word_embeddings = tie_word_embeddings
class Qwen3_5VisionConfig(PretrainedConfig):
    """Hyperparameter container for the Qwen3.5 vision sub-config.

    The base class is initialised with ``kwargs`` first; afterwards every
    named constructor argument is stored as an attribute of the same name.
    """

    model_type = "qwen3_5"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=27,
        hidden_size=1152,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=4304,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        out_hidden_size=3584,
        num_position_embeddings=2304,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # (attribute name, value) pairs, applied in declaration order.
        vision_fields = (
            ("depth", depth),
            ("hidden_size", hidden_size),
            ("hidden_act", hidden_act),
            ("intermediate_size", intermediate_size),
            ("num_heads", num_heads),
            ("in_channels", in_channels),
            ("patch_size", patch_size),
            ("spatial_merge_size", spatial_merge_size),
            ("temporal_patch_size", temporal_patch_size),
            ("out_hidden_size", out_hidden_size),
            ("num_position_embeddings", num_position_embeddings),
            ("initializer_range", initializer_range),
        )
        for field_name, field_value in vision_fields:
            setattr(self, field_name, field_value)
class Qwen3_5Config(PretrainedConfig):
    """Composite Qwen3.5 configuration holding a vision and a text sub-config.

    Args:
        text_config: ``None`` (build a default ``Qwen3_5TextConfig``), a
            ``dict`` of keyword arguments for it, or an already constructed
            config object, which is stored as-is.
        vision_config: Same three forms, for ``Qwen3_5VisionConfig``.
        image_token_id: Stored as ``self.image_token_id``.
        video_token_id: Stored as ``self.video_token_id``.
        vision_start_token_id: Stored as ``self.vision_start_token_id``.
        vision_end_token_id: Stored as ``self.vision_end_token_id``.
        tie_word_embeddings: Stored after the base initialiser has run.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "qwen3_5"
    sub_configs = {
        "vision_config": Qwen3_5VisionConfig,
        "text_config": Qwen3_5TextConfig,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=248056,
        video_token_id=248057,
        vision_start_token_id=248053,
        vision_end_token_id=248054,
        tie_word_embeddings=False,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # Already-built config object: keep it instead of silently
            # leaving ``self.vision_config`` unset.
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            self.text_config = self.sub_configs["text_config"]()
        else:
            # Same pass-through for an already-built text config.
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id

        super().__init__(**kwargs)

        # Set after super().__init__() to avoid the transformers v4
        # PretrainedConfig default overwriting this value.
        self.tie_word_embeddings = tie_word_embeddings
__all__
=
[
"Qwen3_5Config"
,
"Qwen3_5TextConfig"
]
vllm/transformers_utils/configs/qwen3_5_moe.py
0 → 100644
View file @
425eb81e
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen3.5-MoE model configuration"""
from
transformers.configuration_utils
import
PretrainedConfig
,
layer_type_validation
class Qwen3_5MoeTextConfig(PretrainedConfig):
    """Hyperparameter container for the Qwen3.5-MoE text model.

    Every constructor argument is stored as an attribute of the same name.
    When ``layer_types`` is not given, a per-layer list is derived in which
    every ``full_attention_interval``-th layer (default 4, read from
    ``kwargs``) is ``"full_attention"`` and all others are
    ``"linear_attention"``.
    """

    model_type = "qwen3_5_moe_text"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
        "layers.*.mlp.experts.down_proj": "rowwise",
        "layers.*.mlp.shared_expert.gate_proj": "colwise",
        "layers.*.mlp.shared_expert.up_proj": "colwise",
        "layers.*.mlp.shared_expert.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=248320,
        hidden_size=2048,
        num_hidden_layers=40,
        num_attention_heads=16,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_parameters=None,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
        linear_conv_kernel_dim=4,
        linear_key_head_dim=128,
        linear_value_head_dim=128,
        linear_num_key_heads=16,
        linear_num_value_heads=32,
        moe_intermediate_size=512,
        shared_expert_intermediate_size=512,
        num_experts_per_tok=8,
        num_experts=256,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        layer_types=None,
        pad_token_id=None,
        bos_token_id=None,
        eos_token_id=None,
        **kwargs,
    ):
        # Always forwarded to the base class; any caller-supplied value for
        # this key is replaced.
        kwargs["ignore_keys_at_rope_validation"] = [
            "mrope_section",
            "mrope_interleaved",
        ]

        # Core transformer dimensions and behaviour flags.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim
        self.rope_parameters = rope_parameters
        # Only applies when the caller did not provide a rotary factor.
        kwargs.setdefault("partial_rotary_factor", 0.25)

        if layer_types is None:
            # Derive the layer pattern: layer i (0-based) uses full attention
            # exactly when (i + 1) is a multiple of the interval.
            interval = kwargs.get("full_attention_interval", 4)
            layer_types = []
            for layer_idx in range(num_hidden_layers):
                if (layer_idx + 1) % interval != 0:
                    layer_types.append("linear_attention")
                else:
                    layer_types.append("full_attention")
        self.layer_types = layer_types
        layer_type_validation(self.layer_types, self.num_hidden_layers)

        # Linear-attention hyperparameters.
        self.linear_conv_kernel_dim = linear_conv_kernel_dim
        self.linear_key_head_dim = linear_key_head_dim
        self.linear_value_head_dim = linear_value_head_dim
        self.linear_num_key_heads = linear_num_key_heads
        self.linear_num_value_heads = linear_num_value_heads

        # Mixture-of-experts hyperparameters.
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        super().__init__(**kwargs)

        # Assigned only after the base initialiser has run: transformers v4's
        # PretrainedConfig.__init__ takes these as explicit parameters with
        # its own defaults (e.g. tie_word_embeddings=True) and would otherwise
        # overwrite the values chosen here.
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.tie_word_embeddings = tie_word_embeddings
class Qwen3_5MoeVisionConfig(PretrainedConfig):
    """Hyperparameter container for the Qwen3.5-MoE vision sub-config.

    The base class is initialised with ``kwargs`` first; afterwards every
    named constructor argument is stored as an attribute of the same name.
    """

    model_type = "qwen3_5_moe"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=27,
        hidden_size=1152,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=4304,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        out_hidden_size=3584,
        num_position_embeddings=2304,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # (attribute name, value) pairs, applied in declaration order.
        vision_fields = (
            ("depth", depth),
            ("hidden_size", hidden_size),
            ("hidden_act", hidden_act),
            ("intermediate_size", intermediate_size),
            ("num_heads", num_heads),
            ("in_channels", in_channels),
            ("patch_size", patch_size),
            ("spatial_merge_size", spatial_merge_size),
            ("temporal_patch_size", temporal_patch_size),
            ("out_hidden_size", out_hidden_size),
            ("num_position_embeddings", num_position_embeddings),
            ("initializer_range", initializer_range),
        )
        for field_name, field_value in vision_fields:
            setattr(self, field_name, field_value)
class Qwen3_5MoeConfig(PretrainedConfig):
    """Composite Qwen3.5-MoE configuration holding a vision and a text sub-config.

    Args:
        text_config: ``None`` (build a default ``Qwen3_5MoeTextConfig``), a
            ``dict`` of keyword arguments for it, or an already constructed
            config object, which is stored as-is.
        vision_config: Same three forms, for ``Qwen3_5MoeVisionConfig``.
        image_token_id: Stored as ``self.image_token_id``.
        video_token_id: Stored as ``self.video_token_id``.
        vision_start_token_id: Stored as ``self.vision_start_token_id``.
        vision_end_token_id: Stored as ``self.vision_end_token_id``.
        tie_word_embeddings: Stored after the base initialiser has run.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "qwen3_5_moe"
    sub_configs = {
        "vision_config": Qwen3_5MoeVisionConfig,
        "text_config": Qwen3_5MoeTextConfig,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=248056,
        video_token_id=248057,
        vision_start_token_id=248053,
        vision_end_token_id=248054,
        tie_word_embeddings=False,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # Already-built config object: keep it instead of silently
            # leaving ``self.vision_config`` unset.
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            self.text_config = self.sub_configs["text_config"]()
        else:
            # Same pass-through for an already-built text config.
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id

        super().__init__(**kwargs)

        # Set after super().__init__() to avoid the transformers v4
        # PretrainedConfig default overwriting this value.
        self.tie_word_embeddings = tie_word_embeddings
__all__
=
[
"Qwen3_5MoeConfig"
,
"Qwen3_5MoeTextConfig"
]
vllm/transformers_utils/model_arch_config_convertor.py
View file @
425eb81e
...
@@ -371,6 +371,11 @@ class Qwen3NextMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
...
@@ -371,6 +371,11 @@ class Qwen3NextMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
return
getattr
(
self
.
hf_text_config
,
"num_nextn_predict_layers"
,
0
)
return
getattr
(
self
.
hf_text_config
,
"num_nextn_predict_layers"
,
0
)
class Qwen3_5MTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
    """Arch-config convertor registered for the ``qwen3_5_mtp`` model type."""

    def get_num_hidden_layers(self) -> int:
        """Return ``mtp_num_hidden_layers`` from the HF text config.

        Falls back to 0 when the text config does not define that attribute.
        """
        text_config = self.hf_text_config
        return getattr(text_config, "mtp_num_hidden_layers", 0)
class
PanguUltraMoeMTPModelArchConfigConvertor
(
ModelArchConfigConvertorBase
):
class
PanguUltraMoeMTPModelArchConfigConvertor
(
ModelArchConfigConvertorBase
):
def
get_num_hidden_layers
(
self
)
->
int
:
def
get_num_hidden_layers
(
self
)
->
int
:
return
getattr
(
self
.
hf_text_config
,
"num_nextn_predict_layers"
,
0
)
return
getattr
(
self
.
hf_text_config
,
"num_nextn_predict_layers"
,
0
)
...
@@ -396,6 +401,7 @@ MODEL_ARCH_CONFIG_CONVERTORS = {
...
@@ -396,6 +401,7 @@ MODEL_ARCH_CONFIG_CONVERTORS = {
"nemotron-nas"
:
NemotronNasModelArchConfigConvertor
,
"nemotron-nas"
:
NemotronNasModelArchConfigConvertor
,
"deepseek_mtp"
:
DeepSeekMTPModelArchConfigConvertor
,
"deepseek_mtp"
:
DeepSeekMTPModelArchConfigConvertor
,
"qwen3_next_mtp"
:
Qwen3NextMTPModelArchConfigConvertor
,
"qwen3_next_mtp"
:
Qwen3NextMTPModelArchConfigConvertor
,
"qwen3_5_mtp"
:
Qwen3_5MTPModelArchConfigConvertor
,
"mimo_mtp"
:
MimoMTPModelArchConfigConvertor
,
"mimo_mtp"
:
MimoMTPModelArchConfigConvertor
,
"glm4_moe_mtp"
:
GLM4MoeMTPModelArchConfigConvertor
,
"glm4_moe_mtp"
:
GLM4MoeMTPModelArchConfigConvertor
,
"ernie_mtp"
:
ErnieMTPModelArchConfigConvertor
,
"ernie_mtp"
:
ErnieMTPModelArchConfigConvertor
,
...
...
vllm/utils/__init__.py
View file @
425eb81e
...
@@ -61,7 +61,7 @@ class W8a8GetCacheJSON:
...
@@ -61,7 +61,7 @@ class W8a8GetCacheJSON:
self
.
moe_weight_shapes
=
[]
self
.
moe_weight_shapes
=
[]
arch_name
=
torch
.
cuda
.
get_device_properties
(
"cuda"
).
gcnArchName
.
split
(
':'
)[
0
]
arch_name
=
torch
.
cuda
.
get_device_properties
(
"cuda"
).
gcnArchName
.
split
(
':'
)[
0
]
arch_cu
=
torch
.
cuda
.
get_device_properties
(
torch
.
cuda
.
current_device
()).
multi_processor_count
arch_cu
=
torch
.
cuda
.
get_device_properties
(
torch
.
cuda
.
current_device
()).
multi_processor_count
self
.
cache_json_data
=
{}
device_name
=
arch_name
+
'_'
+
str
(
arch_cu
)
+
'cu'
device_name
=
arch_name
+
'_'
+
str
(
arch_cu
)
+
'cu'
self
.
device_name
=
device_name
self
.
device_name
=
device_name
self
.
topk
=
1
self
.
topk
=
1
...
@@ -162,21 +162,30 @@ class W8a8GetCacheJSON:
...
@@ -162,21 +162,30 @@ class W8a8GetCacheJSON:
def
get_blockint8json_name
(
self
,
n
,
k
,
block_n
,
block_k
):
def
get_blockint8json_name
(
self
,
n
,
k
,
block_n
,
block_k
):
return
self
.
triton_json_dir
+
f
"/linear_
{
n
}
_
{
k
}
_block[
{
block_n
}
,
{
block_k
}
]_
{
self
.
device_name
}
.json"
return
self
.
triton_json_dir
+
f
"/linear_
{
n
}
_
{
k
}
_block[
{
block_n
}
,
{
block_k
}
]_
{
self
.
device_name
}
.json"
def
get_moeint8json_name
(
self
,
E
,
N1
,
N2
,
K
,
TOPK
,
def
get_moeint8json_name
(
self
,
E
,
N1
,
N2
,
K
,
TOPK
,
block_size
:
list
|
None
=
None
,
use_int4_w4a8
:
bool
|
None
=
False
):
block_size
:
list
|
None
=
None
,
use_int4_w4a8
:
bool
|
None
=
False
,
use_int8_w8a8
:
bool
|
None
=
False
):
if
use_int4_w4a8
:
if
use_int4_w4a8
:
if
block_size
is
not
None
:
if
block_size
is
not
None
:
return
self
.
triton_json_dir
+
f
"/MOE_W4A8INT8[
{
block_size
[
0
]
}
,
{
block_size
[
1
]
}
]_E=
{
E
}
_N1=
{
N1
}
_N2=
{
N2
}
_K=
{
K
}
_TOPK
{
TOPK
}
_
{
self
.
device_name
}
.json"
return
self
.
triton_json_dir
+
f
"/MOE_W4A8INT8[
{
block_size
[
0
]
}
,
{
block_size
[
1
]
}
]_E=
{
E
}
_N1=
{
N1
}
_N2=
{
N2
}
_K=
{
K
}
_TOPK
{
TOPK
}
_
{
self
.
device_name
}
.json"
else
:
return
self
.
triton_json_dir
+
f
"/MOE_W4A8INT8_E=
{
E
}
_N1=
{
N1
}
_N2=
{
N2
}
_K=
{
K
}
_TOPK
{
TOPK
}
_
{
self
.
device_name
}
.json"
elif
use_int8_w8a8
:
if
block_size
is
not
None
:
return
self
.
triton_json_dir
+
f
"/MOE_BLOCKINT8[
{
block_size
[
0
]
}
,
{
block_size
[
1
]
}
]_E=
{
E
}
_N1=
{
N1
}
_N2=
{
N2
}
_K=
{
K
}
_TOPK
{
TOPK
}
_
{
self
.
device_name
}
.json"
else
:
else
:
return
self
.
triton_json_dir
+
f
"/MOE_W
4
A8INT8_E=
{
E
}
_N1=
{
N1
}
_N2=
{
N2
}
_K=
{
K
}
_TOPK
{
TOPK
}
_
{
self
.
device_name
}
.json"
return
self
.
triton_json_dir
+
f
"/MOE_W
8
A8INT8_E=
{
E
}
_N1=
{
N1
}
_N2=
{
N2
}
_K=
{
K
}
_TOPK
{
TOPK
}
_
{
self
.
device_name
}
.json"
else
:
else
:
if
block_size
is
not
None
:
if
block_size
is
not
None
:
return
self
.
triton_json_dir
+
f
"/MOE_BLOCK
INT
8[
{
block_size
[
0
]
}
,
{
block_size
[
1
]
}
]_E=
{
E
}
_N1=
{
N1
}
_N2=
{
N2
}
_K=
{
K
}
_TOPK
{
TOPK
}
_
{
self
.
device_name
}
.json"
return
self
.
triton_json_dir
+
f
"/MOE_BLOCK
FP
8[
{
block_size
[
0
]
}
,
{
block_size
[
1
]
}
]_E=
{
E
}
_N1=
{
N1
}
_N2=
{
N2
}
_K=
{
K
}
_TOPK
{
TOPK
}
_
{
self
.
device_name
}
.json"
else
:
else
:
return
self
.
triton_json_dir
+
f
"/MOE_W8A8
INT
8_E=
{
E
}
_N1=
{
N1
}
_N2=
{
N2
}
_K=
{
K
}
_TOPK
{
TOPK
}
_
{
self
.
device_name
}
.json"
return
self
.
triton_json_dir
+
f
"/MOE_W8A8
FP
8_E=
{
E
}
_N1=
{
N1
}
_N2=
{
N2
}
_K=
{
K
}
_TOPK
{
TOPK
}
_
{
self
.
device_name
}
.json"
def
get_moeint8_triton_cache
(
self
,
file_path
,
E
,
N1
,
N2
,
K
,
TOPK
):
def
get_moeint8_triton_cache
(
self
,
file_path
,
E
,
N1
,
N2
,
K
,
TOPK
):
if
file_path
in
self
.
cache_json_data
:
# Return the cached data directly to avoid re-reading the file
return
self
.
cache_json_data
[
file_path
]
cache_json_file
=
file_path
cache_json_file
=
file_path
if
os
.
path
.
exists
(
file_path
):
if
os
.
path
.
exists
(
file_path
):
...
@@ -192,7 +201,7 @@ class W8a8GetCacheJSON:
...
@@ -192,7 +201,7 @@ class W8a8GetCacheJSON:
for
sub_key
,
sub_value
in
value
.
items
():
for
sub_key
,
sub_value
in
value
.
items
():
configs_key
=
f
"
{
sub_key
}
_
{
key
}
"
configs_key
=
f
"
{
sub_key
}
_
{
key
}
"
configs_dict
[
configs_key
]
=
sub_value
configs_dict
[
configs_key
]
=
sub_value
self
.
cache_json_data
[
file_path
]
=
configs_dict
return
configs_dict
return
configs_dict
vllm/v1/spec_decode/eagle.py
View file @
425eb81e
...
@@ -1147,6 +1147,8 @@ class SpecDecodeBaseProposer:
...
@@ -1147,6 +1147,8 @@ class SpecDecodeBaseProposer:
"Qwen3VLForConditionalGeneration"
,
"Qwen3VLForConditionalGeneration"
,
"Qwen3VLMoeForConditionalGeneration"
,
"Qwen3VLMoeForConditionalGeneration"
,
"GlmOcrForConditionalGeneration"
,
"GlmOcrForConditionalGeneration"
,
"Qwen3_5ForConditionalGeneration"
,
"Qwen3_5MoeForConditionalGeneration"
,
]:
]:
self
.
model
.
config
.
image_token_index
=
target_model
.
config
.
image_token_id
self
.
model
.
config
.
image_token_index
=
target_model
.
config
.
image_token_id
elif
self
.
get_model_name
(
target_model
)
==
"PixtralForConditionalGeneration"
:
elif
self
.
get_model_name
(
target_model
)
==
"PixtralForConditionalGeneration"
:
...
...
vllm/v1/worker/gpu_input_batch.py
View file @
425eb81e
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Datastructures defining a GPU input batch
# Datastructures defining a GPU input batch
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
,
field
from
typing
import
Optional
,
cast
from
typing
import
Optional
,
cast
import
numpy
as
np
import
numpy
as
np
...
@@ -47,6 +47,12 @@ class CachedRequestState:
...
@@ -47,6 +47,12 @@ class CachedRequestState:
lora_request
:
LoRARequest
|
None
=
None
lora_request
:
LoRARequest
|
None
=
None
prompt_embeds
:
torch
.
Tensor
|
None
=
None
prompt_embeds
:
torch
.
Tensor
|
None
=
None
_prompt_token_ids_np
:
np
.
ndarray
|
None
=
field
(
default
=
None
,
init
=
False
,
repr
=
False
,
compare
=
False
,
)
# Used when both async_scheduling and spec_decode are enabled.
# Used when both async_scheduling and spec_decode are enabled.
prev_num_draft_len
:
int
=
0
prev_num_draft_len
:
int
=
0
...
@@ -332,15 +338,41 @@ class InputBatch:
...
@@ -332,15 +338,41 @@ class InputBatch:
)
)
self
.
num_prompt_tokens
[
req_index
]
=
num_prompt_tokens
self
.
num_prompt_tokens
[
req_index
]
=
num_prompt_tokens
start_idx
=
num_prompt_tokens
start_idx
=
num_prompt_tokens
end_idx
=
start_idx
+
len
(
request
.
output_token_ids
)
if
request
.
prompt_token_ids
is
not
None
:
if
request
.
prompt_token_ids
is
not
None
:
self
.
token_ids_cpu
[
req_index
,
:
num_prompt_tokens
]
=
request
.
prompt_token_ids
if
not
envs
.
VLLM_V1_FAST_TOKEN_ID_COPY
:
self
.
token_ids_cpu
[
req_index
,
:
num_prompt_tokens
]
=
request
.
prompt_token_ids
else
:
prompt_token_ids_np
=
request
.
_prompt_token_ids_np
rebuild_prompt_cache
=
True
if
prompt_token_ids_np
is
not
None
:
rebuild_prompt_cache
=
(
prompt_token_ids_np
.
dtype
!=
np
.
int32
or
prompt_token_ids_np
.
size
!=
num_prompt_tokens
)
if
rebuild_prompt_cache
:
prompt_token_ids_np
=
np
.
asarray
(
request
.
prompt_token_ids
,
dtype
=
np
.
int32
)
request
.
_prompt_token_ids_np
=
prompt_token_ids_np
np
.
copyto
(
self
.
token_ids_cpu
[
req_index
,
:
num_prompt_tokens
],
prompt_token_ids_np
,
casting
=
"no"
,
)
self
.
is_token_ids
[
req_index
,
:
num_prompt_tokens
]
=
True
self
.
is_token_ids
[
req_index
,
:
num_prompt_tokens
]
=
True
else
:
else
:
self
.
is_token_ids
[
req_index
,
:
num_prompt_tokens
]
=
False
self
.
is_token_ids
[
req_index
,
:
num_prompt_tokens
]
=
False
if
request
.
prompt_embeds
is
not
None
:
if
request
.
prompt_embeds
is
not
None
:
self
.
req_prompt_embeds
[
req_index
]
=
request
.
prompt_embeds
self
.
req_prompt_embeds
[
req_index
]
=
request
.
prompt_embeds
self
.
token_ids_cpu
[
req_index
,
start_idx
:
end_idx
]
=
request
.
output_token_ids
if
not
envs
.
VLLM_V1_FAST_TOKEN_ID_COPY
:
end_idx
=
start_idx
+
len
(
request
.
output_token_ids
)
self
.
token_ids_cpu
[
req_index
,
start_idx
:
end_idx
]
=
request
.
output_token_ids
else
:
output_token_ids_np
=
np
.
asarray
(
request
.
output_token_ids
,
dtype
=
np
.
int32
)
end_idx
=
start_idx
+
output_token_ids_np
.
size
np
.
copyto
(
self
.
token_ids_cpu
[
req_index
,
start_idx
:
end_idx
],
output_token_ids_np
,
casting
=
"no"
,
)
self
.
is_token_ids
[
req_index
,
start_idx
:
end_idx
]
=
True
self
.
is_token_ids
[
req_index
,
start_idx
:
end_idx
]
=
True
# Number of tokens without spec decode tokens.
# Number of tokens without spec decode tokens.
self
.
num_tokens_no_spec
[
req_index
]
=
request
.
num_tokens
self
.
num_tokens_no_spec
[
req_index
]
=
request
.
num_tokens
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment