Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fc7980db
Commit
fc7980db
authored
Feb 05, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.15.1' into v0.15.1-ori
parents
3eab7fef
1892993b
Changes
62
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3033 additions
and
130 deletions
+3033
-130
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/quantization/fp8.py
+3
-12
vllm/model_executor/layers/quantization/kernels/scaled_mm/rocm.py
...el_executor/layers/quantization/kernels/scaled_mm/rocm.py
+1
-0
vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
..._executor/layers/quantization/utils/flashinfer_fp4_moe.py
+0
-27
vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
...l_executor/layers/quantization/utils/nvfp4_moe_support.py
+0
-67
vllm/model_executor/layers/utils.py
vllm/model_executor/layers/utils.py
+2
-0
vllm/model_executor/models/adapters.py
vllm/model_executor/models/adapters.py
+10
-4
vllm/model_executor/models/minimax_m2.py
vllm/model_executor/models/minimax_m2.py
+1
-0
vllm/model_executor/models/nemotron_parse.py
vllm/model_executor/models/nemotron_parse.py
+4
-1
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+2
-0
vllm/model_executor/models/step3p5.py
vllm/model_executor/models/step3p5.py
+894
-0
vllm/model_executor/models/step3p5_mtp.py
vllm/model_executor/models/step3p5_mtp.py
+315
-0
vllm/reasoning/__init__.py
vllm/reasoning/__init__.py
+4
-0
vllm/reasoning/step3p5_reasoning_parser.py
vllm/reasoning/step3p5_reasoning_parser.py
+153
-0
vllm/tool_parsers/__init__.py
vllm/tool_parsers/__init__.py
+4
-0
vllm/tool_parsers/step3p5_tool_parser.py
vllm/tool_parsers/step3p5_tool_parser.py
+1511
-0
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+2
-0
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/__init__.py
+4
-0
vllm/transformers_utils/configs/step3p5.py
vllm/transformers_utils/configs/step3p5.py
+100
-0
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+0
-12
vllm/v1/core/kv_cache_coordinator.py
vllm/v1/core/kv_cache_coordinator.py
+23
-7
No files found.
vllm/model_executor/layers/quantization/fp8.py
View file @
fc7980db
...
@@ -26,7 +26,6 @@ from vllm.model_executor.layers.fused_moe import (
...
@@ -26,7 +26,6 @@ from vllm.model_executor.layers.fused_moe import (
)
)
from
vllm.model_executor.layers.fused_moe.config
import
(
from
vllm.model_executor.layers.fused_moe.config
import
(
FusedMoEQuantConfig
,
FusedMoEQuantConfig
,
RoutingMethodType
,
)
)
from
vllm.model_executor.layers.fused_moe.layer
import
UnquantizedFusedMoEMethod
from
vllm.model_executor.layers.fused_moe.layer
import
UnquantizedFusedMoEMethod
from
vllm.model_executor.layers.fused_moe.oracle.fp8
import
(
from
vllm.model_executor.layers.fused_moe.oracle.fp8
import
(
...
@@ -964,17 +963,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
...
@@ -964,17 +963,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
if
self
.
block_quant
:
if
self
.
block_quant
:
import
vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe
# noqa: E501, F401
import
vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe
# noqa: E501, F401
e_score_correction_bias
=
(
layer
.
e_score_correction_bias
.
to
(
x
.
dtype
)
if
layer
.
e_score_correction_bias
is
not
None
else
None
)
routing_method_type
=
layer
.
routing_method_type
return
torch
.
ops
.
vllm
.
flashinfer_fused_moe_blockscale_fp8
(
return
torch
.
ops
.
vllm
.
flashinfer_fused_moe_blockscale_fp8
(
routing_logits
=
router_logits
.
to
(
torch
.
float32
)
routing_logits
=
router_logits
,
if
routing_method_type
==
RoutingMethodType
.
DeepSeekV3
routing_bias
=
layer
.
e_score_correction_bias
,
else
router_logits
,
routing_bias
=
e_score_correction_bias
,
x
=
x
,
x
=
x
,
w13_weight
=
layer
.
w13_weight
,
w13_weight
=
layer
.
w13_weight
,
w13_weight_scale_inv
=
layer
.
w13_weight_scale_inv
,
w13_weight_scale_inv
=
layer
.
w13_weight_scale_inv
,
...
@@ -988,7 +979,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
...
@@ -988,7 +979,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
expert_offset
=
layer
.
ep_rank
*
layer
.
local_num_experts
,
expert_offset
=
layer
.
ep_rank
*
layer
.
local_num_experts
,
local_num_experts
=
layer
.
local_num_experts
,
local_num_experts
=
layer
.
local_num_experts
,
block_shape
=
self
.
weight_block_size
,
block_shape
=
self
.
weight_block_size
,
routing_method_type
=
routing_method_type
,
routing_method_type
=
layer
.
routing_method_type
,
routed_scaling
=
layer
.
routed_scaling_factor
,
routed_scaling
=
layer
.
routed_scaling_factor
,
)
)
else
:
else
:
...
...
vllm/model_executor/layers/quantization/kernels/scaled_mm/rocm.py
View file @
fc7980db
...
@@ -28,6 +28,7 @@ def rocm_per_tensor_float_w8a8_scaled_mm_impl(
...
@@ -28,6 +28,7 @@ def rocm_per_tensor_float_w8a8_scaled_mm_impl(
A
.
shape
[
0
]
==
1
A
.
shape
[
0
]
==
1
and
B
.
shape
[
1
]
%
16
==
0
and
B
.
shape
[
1
]
%
16
==
0
and
((
bias
is
None
)
or
(
bias
.
dtype
==
out_dtype
))
and
((
bias
is
None
)
or
(
bias
.
dtype
==
out_dtype
))
and
A
.
is_contiguous
()
):
):
output
=
ops
.
wvSplitKQ
(
output
=
ops
.
wvSplitKQ
(
B
.
t
(),
B
.
t
(),
...
...
vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
View file @
fc7980db
...
@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING
...
@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING
import
torch
import
torch
import
vllm.envs
as
envs
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -22,10 +21,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
...
@@ -22,10 +21,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
swizzle_blockscale
,
swizzle_blockscale
,
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.flashinfer
import
(
has_flashinfer_cutedsl_grouped_gemm_nt_masked
,
has_flashinfer_cutlass_fused_moe
,
)
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.model_executor.layers.fused_moe.oracle.nvfp4
import
(
from
vllm.model_executor.layers.fused_moe.oracle.nvfp4
import
(
...
@@ -36,8 +31,6 @@ logger = init_logger(__name__)
...
@@ -36,8 +31,6 @@ logger = init_logger(__name__)
__all__
=
[
__all__
=
[
"is_flashinfer_fp4_cutlass_moe_available"
,
"is_flashinfer_fp4_cutedsl_moe_available"
,
"reorder_w1w3_to_w3w1"
,
"reorder_w1w3_to_w3w1"
,
]
]
...
@@ -122,26 +115,6 @@ def is_supported_config_trtllm(
...
@@ -122,26 +115,6 @@ def is_supported_config_trtllm(
return
True
,
None
return
True
,
None
def is_flashinfer_fp4_cutlass_moe_available() -> bool:
    """Report whether the FlashInfer CUTLASS NV-FP4 MoE path can be used.

    The conditions are evaluated left to right with short-circuiting: the
    ``VLLM_USE_FLASHINFER_MOE_FP4`` env flag, then
    ``has_flashinfer_cutlass_fused_moe()``, then ``current_platform.is_cuda()``
    and finally ``current_platform.has_device_capability(100)``.
    """
    flashinfer_fp4_requested = envs.VLLM_USE_FLASHINFER_MOE_FP4
    return (
        flashinfer_fp4_requested
        and has_flashinfer_cutlass_fused_moe()
        and current_platform.is_cuda()
        and current_platform.has_device_capability(100)
    )
def is_flashinfer_fp4_cutedsl_moe_available() -> bool:
    """Report whether the FlashInfer CUTEDSL NV-FP4 MoE path can be used.

    The conditions are evaluated left to right with short-circuiting: the
    ``VLLM_USE_FLASHINFER_MOE_FP4`` env flag, then
    ``has_flashinfer_cutedsl_grouped_gemm_nt_masked()``, then
    ``current_platform.is_cuda()`` and finally
    ``current_platform.is_device_capability_family(100)``.
    """
    flashinfer_fp4_requested = envs.VLLM_USE_FLASHINFER_MOE_FP4
    return (
        flashinfer_fp4_requested
        and has_flashinfer_cutedsl_grouped_gemm_nt_masked()
        and current_platform.is_cuda()
        and current_platform.is_device_capability_family(100)
    )
def
reorder_w1w3_to_w3w1
(
def
reorder_w1w3_to_w3w1
(
weight
:
torch
.
Tensor
,
scale
:
torch
.
Tensor
,
dim
:
int
=
-
2
weight
:
torch
.
Tensor
,
scale
:
torch
.
Tensor
,
dim
:
int
=
-
2
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
...
...
vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py
deleted
100644 → 0
View file @
3eab7fef
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
import
vllm.envs
as
envs
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe
import
(
is_flashinfer_fp4_cutedsl_moe_available
,
is_flashinfer_fp4_cutlass_moe_available
,
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_fp4
import
(
is_fp4_marlin_supported
,
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
cutlass_fp4_supported
,
)
# Names exported by this module.
__all__ = ["detect_nvfp4_moe_support", "NvFp4Support"]

# Module-level logger used by detect_nvfp4_moe_support for one-time messages.
_logger = init_logger(__name__)
@dataclass(frozen=True)
class NvFp4Support:
    """Immutable summary of the NV-FP4 backends found by capability probing."""

    # Whether the CUTLASS FP4 check reported support.
    cutlass_supported: bool

    # Whether FlashInfer kernels may be used for the NV-FP4 MoE path.
    allow_flashinfer: bool

    # Whether the Marlin FP4 kernel is used as the fallback.
    use_marlin: bool
def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
    """Probe which NV-FP4 fused-MoE kernels are usable on this platform.

    Args:
        class_name: Label used in log messages; when empty, the text
            ``"NVFP4 path"`` is logged instead.

    Returns:
        An ``NvFp4Support`` describing CUTLASS support, whether FlashInfer
        kernels may be used, and whether Marlin is the fallback.

    Raises:
        ValueError: If CUTLASS FP4 is unsupported and the Marlin FP4 kernel
            is not supported either.
    """
    log_label = class_name or "NVFP4 path"

    has_cutlass = cutlass_fp4_supported()
    # FlashInfer is only considered when CUTLASS FP4 itself is supported.
    flashinfer_ok = has_cutlass and (
        is_flashinfer_fp4_cutlass_moe_available()
        or is_flashinfer_fp4_cutedsl_moe_available()
    )

    if flashinfer_ok:
        _logger.info_once("Using FlashInfer kernels for %s.", log_label)
    elif envs.VLLM_USE_FLASHINFER_MOE_FP4:
        # The user asked for FlashInfer but it cannot be used here.
        _logger.warning_once(
            "FlashInfer kernels unavailable for %s on current platform.",
            log_label,
        )

    # Marlin is the fallback exactly when CUTLASS FP4 is not supported.
    marlin_fallback = not has_cutlass
    if marlin_fallback:
        if not is_fp4_marlin_supported():
            raise ValueError(
                "Current platform does not support NVFP4 quantization. "
                "Please use Blackwell GPUs or enable FlashInfer."
            )
        _logger.info_once("Falling back to Marlin FP4 MoE kernel.")

    return NvFp4Support(
        cutlass_supported=has_cutlass,
        allow_flashinfer=flashinfer_ok,
        use_marlin=marlin_fallback,
    )
vllm/model_executor/layers/utils.py
View file @
fc7980db
...
@@ -146,6 +146,7 @@ def rocm_unquantized_gemm_impl(
...
@@ -146,6 +146,7 @@ def rocm_unquantized_gemm_impl(
and
n
<=
128
and
n
<=
128
and
k
>
512
and
k
>
512
and
math
.
ceil
(
k
/
512
)
*
math
.
ceil
(
m
/
16
)
<
get_cu_count
()
and
math
.
ceil
(
k
/
512
)
*
math
.
ceil
(
m
/
16
)
<
get_cu_count
()
and
x
.
is_contiguous
()
)
)
# k == 2880 and (m == 640 or m == 128))
# k == 2880 and (m == 640 or m == 128))
)
)
...
@@ -165,6 +166,7 @@ def rocm_unquantized_gemm_impl(
...
@@ -165,6 +166,7 @@ def rocm_unquantized_gemm_impl(
and
on_gfx9
()
and
on_gfx9
()
and
x
.
dtype
in
[
torch
.
float16
,
torch
.
bfloat16
]
and
x
.
dtype
in
[
torch
.
float16
,
torch
.
bfloat16
]
and
k
%
8
==
0
and
k
%
8
==
0
and
x
.
is_contiguous
()
)
)
if
use_skinny
is
not
True
:
if
use_skinny
is
not
True
:
...
...
vllm/model_executor/models/adapters.py
View file @
fc7980db
...
@@ -466,6 +466,7 @@ def load_weights_using_from_2_way_softmax(
...
@@ -466,6 +466,7 @@ def load_weights_using_from_2_way_softmax(
language_model
=
_get_language_model_for_seq_cls
(
model
)
language_model
=
_get_language_model_for_seq_cls
(
model
)
is_vlm
=
language_model
is
not
model
is_vlm
=
language_model
is
not
model
using_vlm_head
=
is_vlm
and
hasattr
(
language_model
,
"score"
)
language_model
.
lm_head
=
ParallelLMHead
(
language_model
.
lm_head
=
ParallelLMHead
(
text_config
.
vocab_size
,
text_config
.
hidden_size
,
quant_config
=
quant_config
text_config
.
vocab_size
,
text_config
.
hidden_size
,
quant_config
=
quant_config
...
@@ -506,14 +507,16 @@ def load_weights_using_from_2_way_softmax(
...
@@ -506,14 +507,16 @@ def load_weights_using_from_2_way_softmax(
torch
.
float32
torch
.
float32
)
-
lm_head_weight
.
data
[[
false_id
]].
to
(
torch
.
float32
)
)
-
lm_head_weight
.
data
[[
false_id
]].
to
(
torch
.
float32
)
score_layer
=
language_model
.
score
if
is_vlm
else
model
.
score
score_layer
=
language_model
.
score
if
using_vlm_head
else
model
.
score
param
=
score_layer
.
weight
param
=
score_layer
.
weight
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
score_weight
)
weight_loader
(
param
,
score_weight
)
del
language_model
.
lm_head
del
language_model
.
lm_head
score_weight_name
=
"language_model.score.weight"
if
is_vlm
else
"score.weight"
score_weight_name
=
(
"language_model.score.weight"
if
using_vlm_head
else
"score.weight"
)
loaded_weights
.
add
(
score_weight_name
)
loaded_weights
.
add
(
score_weight_name
)
lm_head_name
=
"lm_head.weight"
lm_head_name
=
"lm_head.weight"
...
@@ -537,6 +540,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
...
@@ -537,6 +540,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
language_model
=
_get_language_model_for_seq_cls
(
model
)
language_model
=
_get_language_model_for_seq_cls
(
model
)
is_vlm
=
language_model
is
not
model
is_vlm
=
language_model
is
not
model
using_vlm_head
=
is_vlm
and
hasattr
(
language_model
,
"score"
)
language_model
.
lm_head
=
ParallelLMHead
(
language_model
.
lm_head
=
ParallelLMHead
(
text_config
.
vocab_size
,
text_config
.
hidden_size
,
quant_config
=
quant_config
text_config
.
vocab_size
,
text_config
.
hidden_size
,
quant_config
=
quant_config
...
@@ -572,14 +576,16 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
...
@@ -572,14 +576,16 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
t
)
for
t
in
tokens
]
token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
t
)
for
t
in
tokens
]
score_weight
=
language_model
.
lm_head
.
weight
.
data
[
token_ids
]
score_weight
=
language_model
.
lm_head
.
weight
.
data
[
token_ids
]
score_layer
=
language_model
.
score
if
is_vlm
else
model
.
score
score_layer
=
language_model
.
score
if
using_vlm_head
else
model
.
score
param
=
score_layer
.
weight
param
=
score_layer
.
weight
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
score_weight
)
weight_loader
(
param
,
score_weight
)
del
language_model
.
lm_head
del
language_model
.
lm_head
score_weight_name
=
"language_model.score.weight"
if
is_vlm
else
"score.weight"
score_weight_name
=
(
"language_model.score.weight"
if
using_vlm_head
else
"score.weight"
)
loaded_weights
.
add
(
score_weight_name
)
loaded_weights
.
add
(
score_weight_name
)
lm_head_name
=
"lm_head.weight"
lm_head_name
=
"lm_head.weight"
...
...
vllm/model_executor/models/minimax_m2.py
View file @
fc7980db
...
@@ -107,6 +107,7 @@ class MiniMaxM2MoE(nn.Module):
...
@@ -107,6 +107,7 @@ class MiniMaxM2MoE(nn.Module):
renormalize
=
True
,
renormalize
=
True
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.experts"
,
prefix
=
f
"
{
prefix
}
.experts"
,
router_logits_dtype
=
torch
.
float32
,
)
)
self
.
gate
=
ReplicatedLinear
(
self
.
gate
=
ReplicatedLinear
(
...
...
vllm/model_executor/models/nemotron_parse.py
View file @
fc7980db
...
@@ -11,7 +11,6 @@ import math
...
@@ -11,7 +11,6 @@ import math
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
typing
import
Annotated
,
Literal
from
typing
import
Annotated
,
Literal
import
cv2
import
numpy
as
np
import
numpy
as
np
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
...
@@ -416,6 +415,8 @@ class NemotronParseImageProcessor:
...
@@ -416,6 +415,8 @@ class NemotronParseImageProcessor:
else
:
else
:
self
.
target_height
=
self
.
target_width
=
int
(
self
.
final_size
)
self
.
target_height
=
self
.
target_width
=
int
(
self
.
final_size
)
import
cv2
self
.
transform
=
A
.
Compose
(
self
.
transform
=
A
.
Compose
(
[
[
A
.
PadIfNeeded
(
A
.
PadIfNeeded
(
...
@@ -457,6 +458,8 @@ class NemotronParseImageProcessor:
...
@@ -457,6 +458,8 @@ class NemotronParseImageProcessor:
new_height
=
int
(
new_width
/
aspect_ratio
)
new_height
=
int
(
new_width
/
aspect_ratio
)
# Use cv2.INTER_LINEAR like the original
# Use cv2.INTER_LINEAR like the original
import
cv2
return
cv2
.
resize
(
return
cv2
.
resize
(
image
,
(
new_width
,
new_height
),
interpolation
=
cv2
.
INTER_LINEAR
image
,
(
new_width
,
new_height
),
interpolation
=
cv2
.
INTER_LINEAR
)
)
...
...
vllm/model_executor/models/registry.py
View file @
fc7980db
...
@@ -188,6 +188,7 @@ _TEXT_GENERATION_MODELS = {
...
@@ -188,6 +188,7 @@ _TEXT_GENERATION_MODELS = {
"SeedOssForCausalLM"
:
(
"seed_oss"
,
"SeedOssForCausalLM"
),
"SeedOssForCausalLM"
:
(
"seed_oss"
,
"SeedOssForCausalLM"
),
"Step1ForCausalLM"
:
(
"step1"
,
"Step1ForCausalLM"
),
"Step1ForCausalLM"
:
(
"step1"
,
"Step1ForCausalLM"
),
"Step3TextForCausalLM"
:
(
"step3_text"
,
"Step3TextForCausalLM"
),
"Step3TextForCausalLM"
:
(
"step3_text"
,
"Step3TextForCausalLM"
),
"Step3p5ForCausalLM"
:
(
"step3p5"
,
"Step3p5ForCausalLM"
),
"StableLMEpochForCausalLM"
:
(
"stablelm"
,
"StablelmForCausalLM"
),
"StableLMEpochForCausalLM"
:
(
"stablelm"
,
"StablelmForCausalLM"
),
"StableLmForCausalLM"
:
(
"stablelm"
,
"StablelmForCausalLM"
),
"StableLmForCausalLM"
:
(
"stablelm"
,
"StablelmForCausalLM"
),
"Starcoder2ForCausalLM"
:
(
"starcoder2"
,
"Starcoder2ForCausalLM"
),
"Starcoder2ForCausalLM"
:
(
"starcoder2"
,
"Starcoder2ForCausalLM"
),
...
@@ -478,6 +479,7 @@ _SPECULATIVE_DECODING_MODELS = {
...
@@ -478,6 +479,7 @@ _SPECULATIVE_DECODING_MODELS = {
"MedusaModel"
:
(
"medusa"
,
"Medusa"
),
"MedusaModel"
:
(
"medusa"
,
"Medusa"
),
"OpenPanguMTPModel"
:
(
"openpangu_mtp"
,
"OpenPanguMTP"
),
"OpenPanguMTPModel"
:
(
"openpangu_mtp"
,
"OpenPanguMTP"
),
"Qwen3NextMTP"
:
(
"qwen3_next_mtp"
,
"Qwen3NextMTP"
),
"Qwen3NextMTP"
:
(
"qwen3_next_mtp"
,
"Qwen3NextMTP"
),
"Step3p5MTP"
:
(
"step3p5_mtp"
,
"Step3p5MTP"
),
# Temporarily disabled.
# Temporarily disabled.
# # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
# # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
# "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
# "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
...
...
vllm/model_executor/models/step3p5.py
0 → 100644
View file @
fc7980db
This diff is collapsed.
Click to expand it.
vllm/model_executor/models/step3p5_mtp.py
0 → 100644
View file @
fc7980db
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Iterable
import
torch
import
torch.nn
as
nn
from
transformers
import
PretrainedConfig
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.layernorm
import
GemmaRMSNorm
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
,
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.sequence
import
IntermediateTensors
from
.step3p5
import
Step3p5DecoderLayer
,
get_spec_layer_idx_from_weight_name
from
.utils
import
maybe_prefix
# Module-level logger for the Step3p5 MTP model code.
logger = init_logger(__name__)
class SharedHead(nn.Module):
    """Final norm plus LM head belonging to one MTP layer.

    ``forward`` applies only the norm; the ``head`` module is stored here so
    that callers can pass it to a logits processor separately.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: QuantizationConfig | None = None,
    ) -> None:
        super().__init__()
        hidden_dim = config.hidden_size
        self.norm = GemmaRMSNorm(hidden_dim, config.rms_norm_eps)
        self.head = ParallelLMHead(
            config.vocab_size,
            hidden_dim,
            quant_config=quant_config,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return ``hidden_states`` after the norm (the head is not applied)."""
        normalized = self.norm(hidden_states)
        return normalized
class Step3p5AMultiTokenPredictorLayer(nn.Module):
    """One multi-token-prediction layer.

    Normalizes the token embeddings and the previous hidden states
    separately, concatenates them on the last dimension, projects back to
    ``hidden_size`` and runs the result through a ``Step3p5DecoderLayer``.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        prefix: str,
    ) -> None:
        super().__init__()
        hf_config = vllm_config.model_config.hf_config
        hidden_dim = hf_config.hidden_size
        norm_eps = hf_config.rms_norm_eps

        self.enorm = GemmaRMSNorm(hidden_dim, norm_eps)
        self.hnorm = GemmaRMSNorm(hidden_dim, norm_eps)
        # Projects the concatenated [embedding, hidden] pair back to hidden_dim.
        self.eh_proj = nn.Linear(hidden_dim * 2, hidden_dim, bias=False)
        self.shared_head = SharedHead(
            config=hf_config, quant_config=vllm_config.quant_config
        )
        self.mtp_block = Step3p5DecoderLayer(
            vllm_config,
            prefix=f"{prefix}.mtp_block",
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        previous_hidden_states: torch.Tensor,
        inputs_embeds: torch.Tensor | None = None,
        spec_step_index: int = 0,
    ) -> torch.Tensor:
        """Compute this layer's hidden states.

        ``input_ids`` and ``spec_step_index`` are accepted but not read by
        this method; ``inputs_embeds`` must be provided.
        """
        assert inputs_embeds is not None
        normed_embeds = self.enorm(inputs_embeds)
        normed_prev = self.hnorm(previous_hidden_states)
        fused = torch.cat([normed_embeds, normed_prev], dim=-1)
        projected = self.eh_proj(fused)
        return self.mtp_block(positions=positions, hidden_states=projected)
class Step3p5AMultiTokenPredictor(nn.Module):
    """Container for the token embedding and the stack of MTP layers.

    Layers are stored in a ``ModuleDict`` keyed by their absolute layer index
    (starting at ``config.num_hidden_layers``) so that keys line up with the
    layer indices used in checkpoint weight names.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        hf_config = vllm_config.model_config.hf_config

        self.embed_tokens = VocabParallelEmbedding(
            hf_config.vocab_size,
            hf_config.hidden_size,
        )

        self.mtp_start_layer_idx = hf_config.num_hidden_layers
        self.num_mtp_layers = hf_config.num_nextn_predict_layers

        # Keyed by absolute index to map the exact layer index from weights.
        first_idx = self.mtp_start_layer_idx
        mtp_layers = {}
        for layer_idx in range(first_idx, first_idx + self.num_mtp_layers):
            mtp_layers[str(layer_idx)] = Step3p5AMultiTokenPredictorLayer(
                vllm_config,
                f"{prefix}.layers.{layer_idx}",
            )
        self.layers = torch.nn.ModuleDict(mtp_layers)

        self.logits_processor = LogitsProcessor(hf_config.vocab_size)

    def _resolve_step(self, spec_step_idx: int):
        """Return ``(wrapped_step, layer)`` for a speculative step index.

        The step index wraps around modulo ``num_mtp_layers``.
        """
        wrapped_step = spec_step_idx % self.num_mtp_layers
        layer_key = str(self.mtp_start_layer_idx + wrapped_step)
        return wrapped_step, self.layers[layer_key]

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        previous_hidden_states: torch.Tensor,
        inputs_embeds: torch.Tensor | None = None,
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        """Run the MTP layer selected by ``spec_step_idx``.

        Embeds ``input_ids`` when ``inputs_embeds`` is not supplied.
        """
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        wrapped_step, mtp_layer = self._resolve_step(spec_step_idx)
        return mtp_layer(
            input_ids,
            positions,
            previous_hidden_states,
            inputs_embeds,
            wrapped_step,
        )

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        """Compute logits with the shared head of the selected MTP layer."""
        _, mtp_layer = self._resolve_step(spec_step_idx)
        shared_head = mtp_layer.shared_head
        return self.logits_processor(shared_head.head, shared_head(hidden_states))

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Look up token embeddings for ``input_ids``."""
        return self.embed_tokens(input_ids)
class Step3p5MTP(nn.Module):
    """Top-level Step3p5 multi-token-prediction (MTP) model.

    Wraps a ``Step3p5AMultiTokenPredictor`` as ``self.model`` and implements
    checkpoint loading, which renames checkpoint weights onto this module's
    parameter names.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.config = vllm_config.model_config.hf_config
        self.vllm_config = vllm_config
        self.model = Step3p5AMultiTokenPredictor(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
        )

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Delegate token embedding lookup to the wrapped predictor."""
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        """Run the wrapped predictor.

        ``intermediate_tensors`` is accepted but not read by this method.
        """
        hidden_states = self.model(
            input_ids, positions, hidden_states, inputs_embeds, spec_step_idx
        )
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        spec_step_idx: int = 0,
    ) -> torch.Tensor | None:
        """Delegate logits computation to the wrapped predictor."""
        return self.model.compute_logits(hidden_states, spec_step_idx)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load MTP weights from ``(name, tensor)`` checkpoint pairs.

        Weights that do not belong to a spec (MTP) layer are skipped, except
        for ``embed_tokens``. Names are rewritten to this module's layout,
        q/k/v and gate/up shards are routed into their fused parameters, and
        MoE expert tensors are loaded one expert at a time along dim 0.

        Returns:
            The set of parameter names that were loaded.

        Raises:
            RuntimeError: If a required parameter was not found in the
                checkpoint. Single-element, non-trainable ``.k_scale`` /
                ``.v_scale`` / ``.q_scale`` / ``.prob_scale`` parameters are
                treated as optional.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]

        expert_params_mapping = [
            # (param_name, weight_name, shard_id)
            (".moe.experts.w13_weight", ".moe.gate_proj.weight", "w1"),
            (".moe.experts.w13_weight", ".moe.up_proj.weight", "w3"),
            (".moe.experts.w2_weight", ".moe.down_proj.weight", "w2"),
        ]

        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
            # Only spec-layer weights and the token embedding are relevant.
            if "embed_tokens" not in name and spec_layer is None:
                continue
            name = self._rewrite_spec_layer_name(spec_layer, name)
            for param_name, weight_name, shard_id in stacked_params_mapping:
                # Skip non-stacked layers and experts (experts handled below).
                if weight_name not in name:
                    continue
                # We have mlp.experts[0].gate_proj in the checkpoint.
                # Since we handle the experts below in expert_params_mapping,
                # we need to skip here BEFORE we update the name, otherwise
                # name will be updated to mlp.experts[0].gate_up_proj, which
                # will then be updated below in expert_params_mapping
                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
                if ("mlp.experts." in name) and name not in params_dict:
                    continue
                if "experts" in name or "moe" in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                for mapping in expert_params_mapping:
                    param_name, weight_name, shard_id = mapping
                    if weight_name not in name:
                        continue
                    name = name.replace(weight_name, param_name)
                    # Skip loading extra bias for GPTQ models.
                    if (
                        name.endswith(".bias") or name.endswith("_bias")
                    ) and name not in params_dict:
                        continue
                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    # The checkpoint tensor is indexed per expert on dim 0.
                    for expert_id in range(loaded_weight.shape[0]):
                        loaded_weight_expert = loaded_weight[expert_id]
                        weight_loader(
                            param,
                            loaded_weight_expert,
                            name,
                            shard_id=shard_id,
                            expert_id=expert_id,
                        )
                    loaded_params.add(name)
                    break
                else:
                    # Skip loading extra bias for GPTQ models.
                    if (
                        name.endswith(".bias")
                        and name not in params_dict
                        or "tok_embeddings" in name
                    ):
                        continue

                    if spec_layer is not None and ".transformer." in name:
                        name = name.replace(".transformer.", ".")
                    if "shared_head" in name:
                        name = name.replace("shared_head.output", "shared_head.head")
                    if "embed_tokens" in name:
                        assert (
                            hasattr(self.config, "num_nextn_predict_layers")
                            and self.config.num_nextn_predict_layers > 0
                        )
                        name = "model.embed_tokens.weight"
                    param = params_dict[name]
                    weight_loader = getattr(
                        param, "weight_loader", default_weight_loader
                    )
                    weight_loader(param, loaded_weight)
            loaded_params.add(name)

        params_need_to_load = set(params_dict.keys())
        # Some KV cache scales are optional: checkpoints may omit them and vLLM
        # will fall back to default scales during initialization.
        optional_params = {
            name
            for name, param in params_dict.items()
            if name.endswith((".k_scale", ".v_scale", ".q_scale", ".prob_scale"))
            and getattr(param, "numel", lambda: 0)() == 1
            and getattr(param, "requires_grad", False) is False
        }
        params_need_to_load -= optional_params

        # Only genuinely missing parameters are an error. Comparing the sets
        # for inequality would also trigger when the checkpoint supplies an
        # optional scale, leaving nothing to report and raising IndexError.
        missing_params = params_need_to_load - loaded_params
        if missing_params:
            # min() keeps the reported example stable across runs.
            param_name_example = min(missing_params)
            raise RuntimeError(
                "Some parameters like "
                f"{param_name_example} are not in the checkpoint and will falsely "
                "use random initialization"
            )

        return loaded_params

    def _rewrite_spec_layer_name(self, spec_layer: int | None, name: str) -> str:
        """
        Rewrite the weight name to match the format of the original model.
        Add .mtp_block for modules in transformer layer block for spec layer.

        Names containing one of the spec-layer-level module names
        (``embed_tokens``, ``enorm``, ``hnorm``, ``eh_proj``, ``shared_head``)
        are returned unchanged. ``spec_layer`` may be ``None`` for
        ``embed_tokens`` weights, which take that unchanged path.
        """
        spec_layer_weight_names = (
            "embed_tokens",
            "enorm",
            "hnorm",
            "eh_proj",
            "shared_head",
        )
        if any(weight_name in name for weight_name in spec_layer_weight_names):
            return name
        # treat rest weights as weights for transformer layer block
        return name.replace(
            f"model.layers.{spec_layer}.", f"model.layers.{spec_layer}.mtp_block."
        )
vllm/reasoning/__init__.py
View file @
fc7980db
...
@@ -84,6 +84,10 @@ _REASONING_PARSERS_TO_REGISTER = {
...
@@ -84,6 +84,10 @@ _REASONING_PARSERS_TO_REGISTER = {
"step3_reasoning_parser"
,
"step3_reasoning_parser"
,
"Step3ReasoningParser"
,
"Step3ReasoningParser"
,
),
),
"step3p5"
:
(
"step3p5_reasoning_parser"
,
"Step3p5ReasoningParser"
,
),
}
}
...
...
vllm/reasoning/step3p5_reasoning_parser.py
0 → 100644
View file @
fc7980db
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Sequence
from
vllm.entrypoints.openai.chat_completion.protocol
import
(
ChatCompletionRequest
,
)
from
vllm.entrypoints.openai.engine.protocol
import
DeltaMessage
from
vllm.entrypoints.openai.responses.protocol
import
(
ResponsesRequest
,
)
from
vllm.reasoning.basic_parsers
import
BaseThinkingReasoningParser
from
vllm.tokenizers
import
TokenizerLike
class Step3p5ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the Step3p5 model.

    Step3p5 wraps its reasoning in <think>...</think>, but it tends to emit an
    extra newline immediately before and/or after the </think> token. This
    parser strips:
    - the newline right before </think>
    - the newline right after </think>
    """

    @property
    def start_token(self) -> str:
        return "<think>"

    @property
    def end_token(self) -> str:
        return "</think>"

    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        # Set when a trailing "\n" of the reasoning text is being held back
        # until we know whether </think> follows it.
        self._pending_reasoning_newline = False

        # Delays reasoning-end detection. This is needed to remove the
        # newline that appears immediately after </think>, which may cause
        # the end detection to be delayed by one round.
        self.end_offset = 1

    def _delayed_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """Shared end check: consume one delay round, then report the end.

        While ``end_offset`` is positive and the end token id is present,
        decrement ``end_offset`` and report "not ended". Otherwise report
        whether ``end_offset`` has dropped below 1.
        """
        if self.end_offset > 0 and self.end_token_id in input_ids:
            self.end_offset -= 1
            return False
        return self.end_offset < 1

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        return self._delayed_reasoning_end(input_ids)

    def is_reasoning_end_streaming(
        self, input_ids: Sequence[int], delta_ids: Sequence[int]
    ) -> bool:
        # ``delta_ids`` is not read; only ``input_ids`` is inspected.
        return self._delayed_reasoning_end(input_ids)

    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """Split a full output and trim the newlines around </think>.

        Empty strings are normalized to ``None``.
        """
        raw_reasoning, raw_content = super().extract_reasoning(model_output, request)
        trimmed_reasoning = (
            None if raw_reasoning is None else raw_reasoning.removesuffix("\n")
        )
        trimmed_content = (
            None if raw_content is None else raw_content.removeprefix("\n")
        )
        return trimmed_reasoning or None, trimmed_content or None

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Streaming variant: trims the newlines around </think> per delta.

        Returns ``None`` when the delta carries nothing to emit.
        """
        # Drop the immediate newline that models often emit after </think>.
        if previous_text.endswith(self.end_token) and delta_text.startswith("\n"):
            remaining = delta_text.removeprefix("\n")
            return DeltaMessage(content=remaining) if remaining else None

        ret = super().extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
        if ret is None:
            return None

        # Compatibility path for models that don't generate the start token:
        # treat everything before </think> as reasoning and everything after
        # as content.
        start_seen = (
            self.start_token_id in previous_token_ids
            or self.start_token_id in delta_token_ids
        )
        if not start_seen:
            if self.end_token_id in delta_token_ids:
                end_index = delta_text.find(self.end_token)
                before_end = delta_text[:end_index]
                after_end = delta_text[end_index + len(self.end_token) :]
                ret = DeltaMessage(reasoning=before_end, content=after_end or None)
            elif self.end_token_id in previous_token_ids:
                ret = DeltaMessage(content=delta_text)
            else:
                ret = DeltaMessage(reasoning=delta_text)

        reasoning_to_output = ret.reasoning
        content_to_output = ret.content
        end_in_delta = self.end_token in delta_text

        # Reasoning: handle the newline immediately before </think>.
        if reasoning_to_output is not None:
            if self._pending_reasoning_newline:
                # The held-back newline was not followed by </think>; emit it.
                reasoning_to_output = "\n" + reasoning_to_output
                self._pending_reasoning_newline = False

            if reasoning_to_output.endswith("\n"):
                reasoning_to_output = reasoning_to_output.removesuffix("\n")
                # If </think> is in this delta the trailing "\n" sits right
                # before it and is dropped; otherwise hold it until we know
                # whether </think> follows.
                self._pending_reasoning_newline = not end_in_delta

        # Content: handle the newline immediately after </think>.
        if content_to_output is not None:
            # No need to get into parser again to remove newline after </think>.
            self.end_offset -= 1

            # If we have content, reasoning must have ended.
            self._pending_reasoning_newline = False

            if end_in_delta and content_to_output.startswith("\n"):
                content_to_output = content_to_output.removeprefix("\n")

        reasoning_to_output = reasoning_to_output or None
        content_to_output = content_to_output or None
        if reasoning_to_output is None and content_to_output is None:
            return None

        return DeltaMessage(reasoning=reasoning_to_output, content=content_to_output)
vllm/tool_parsers/__init__.py
View file @
fc7980db
...
@@ -134,6 +134,10 @@ _TOOL_PARSERS_TO_REGISTER = {
...
@@ -134,6 +134,10 @@ _TOOL_PARSERS_TO_REGISTER = {
"step3_tool_parser"
,
"step3_tool_parser"
,
"Step3ToolParser"
,
"Step3ToolParser"
,
),
),
"step3p5"
:
(
"step3p5_tool_parser"
,
"Step3p5ToolParser"
,
),
"xlam"
:
(
"xlam"
:
(
"xlam_tool_parser"
,
"xlam_tool_parser"
,
"xLAMToolParser"
,
"xLAMToolParser"
,
...
...
vllm/tool_parsers/step3p5_tool_parser.py
0 → 100644
View file @
fc7980db
This diff is collapsed.
Click to expand it.
vllm/transformers_utils/config.py
View file @
fc7980db
...
@@ -96,6 +96,8 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
...
@@ -96,6 +96,8 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
ultravox
=
"UltravoxConfig"
,
ultravox
=
"UltravoxConfig"
,
step3_vl
=
"Step3VLConfig"
,
step3_vl
=
"Step3VLConfig"
,
step3_text
=
"Step3TextConfig"
,
step3_text
=
"Step3TextConfig"
,
step3p5
=
"Step3p5Config"
,
qwen3_asr
=
"Qwen3ASRConfig"
,
qwen3_next
=
"Qwen3NextConfig"
,
qwen3_next
=
"Qwen3NextConfig"
,
lfm2_moe
=
"Lfm2MoeConfig"
,
lfm2_moe
=
"Lfm2MoeConfig"
,
tarsier2
=
"Tarsier2Config"
,
tarsier2
=
"Tarsier2Config"
,
...
...
vllm/transformers_utils/configs/__init__.py
View file @
fc7980db
...
@@ -50,6 +50,8 @@ _CLASS_TO_MODULE: dict[str, str] = {
...
@@ -50,6 +50,8 @@ _CLASS_TO_MODULE: dict[str, str] = {
"Step3VLConfig"
:
"vllm.transformers_utils.configs.step3_vl"
,
"Step3VLConfig"
:
"vllm.transformers_utils.configs.step3_vl"
,
"Step3VisionEncoderConfig"
:
"vllm.transformers_utils.configs.step3_vl"
,
"Step3VisionEncoderConfig"
:
"vllm.transformers_utils.configs.step3_vl"
,
"Step3TextConfig"
:
"vllm.transformers_utils.configs.step3_vl"
,
"Step3TextConfig"
:
"vllm.transformers_utils.configs.step3_vl"
,
"Step3p5Config"
:
"vllm.transformers_utils.configs.step3p5"
,
"Qwen3ASRConfig"
:
"vllm.transformers_utils.configs.qwen3_asr"
,
"Qwen3NextConfig"
:
"vllm.transformers_utils.configs.qwen3_next"
,
"Qwen3NextConfig"
:
"vllm.transformers_utils.configs.qwen3_next"
,
"Tarsier2Config"
:
"vllm.transformers_utils.configs.tarsier2"
,
"Tarsier2Config"
:
"vllm.transformers_utils.configs.tarsier2"
,
# Special case: DeepseekV3Config is from HuggingFace Transformers
# Special case: DeepseekV3Config is from HuggingFace Transformers
...
@@ -90,6 +92,8 @@ __all__ = [
...
@@ -90,6 +92,8 @@ __all__ = [
"Step3VLConfig"
,
"Step3VLConfig"
,
"Step3VisionEncoderConfig"
,
"Step3VisionEncoderConfig"
,
"Step3TextConfig"
,
"Step3TextConfig"
,
"Step3p5Config"
,
"Qwen3ASRConfig"
,
"Qwen3NextConfig"
,
"Qwen3NextConfig"
,
"Tarsier2Config"
,
"Tarsier2Config"
,
]
]
...
...
vllm/transformers_utils/configs/step3p5.py
0 → 100644
View file @
fc7980db
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Any
from
transformers.configuration_utils
import
PretrainedConfig
class Step3p5Config(PretrainedConfig):
    """Hyper-parameter container for models whose ``model_type`` is "step3p5".

    Every constructor argument is stored on the instance under the same
    name. Three values are derived rather than copied verbatim:

    * ``num_experts_per_tok`` is stored as a second copy of ``moe_top_k``.
    * ``share_expert_dim`` falls back to
      ``moe_intermediate_size * moe_top_k`` when passed as ``None``.
    * ``bos_token_id`` / ``eos_token_id`` fall back to ``1`` and ``[2, 3]``
      respectively when passed as ``None``.

    Any additional keyword arguments are forwarded untouched to
    ``PretrainedConfig.__init__``.
    """

    model_type = "step3p5"

    def __init__(
        self,
        hidden_size: int = 5120,
        intermediate_size: int = 13312,
        num_attention_heads: int = 40,
        num_attention_groups: int = 8,
        num_hidden_layers: int = 48,
        max_seq_len: int = 4096,
        vocab_size: int = 65536,
        rms_norm_eps: float = 1e-5,
        moe_every_n_layer: int = 2,
        use_moe: bool = False,
        moe_intermediate_size: int = 10240,
        moe_num_experts: int = 16,
        moe_top_k: int = 4,
        moe_layer_offset: int = 0,
        rope_theta: float | list[float] | None = 500000,
        rope_scaling: dict[str, Any] | None = None,
        head_dim: int | None = None,
        share_expert_dim: int | None = None,
        norm_expert_weight: bool = True,
        bos_token_id: list[int] | int | None = None,
        eos_token_id: list[int] | int | None = None,
        moe_router_activation: str = "softmax",
        moe_router_scaling_factor: float = 1.0,
        att_impl_type: str = "GQA",
        use_head_wise_attn_gate: bool = False,
        use_moe_router_bias: bool = True,
        need_fp32_gate: bool = True,
        layer_types: list[str] | None = None,
        use_rope_layers: list[bool] | None = None,
        yarn_only_types: list[str] | None = None,
        attention_other_setting: dict[str, Any] | None = None,
        num_nextn_predict_layers: int = 0,
        swiglu_limits: list[float] | None = None,
        swiglu_limits_shared: list[float] | None = None,
        max_position_embeddings: int | None = None,
        **kwargs,
    ):
        # --- core transformer dimensions ---
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_attention_groups = num_attention_groups
        self.num_hidden_layers = num_hidden_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps

        # --- mixture-of-experts layout ---
        self.use_moe = use_moe
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_every_n_layer = moe_every_n_layer
        self.moe_num_experts = moe_num_experts
        # The top-k value is exposed under two attribute names.
        self.num_experts_per_tok = moe_top_k
        self.moe_top_k = moe_top_k
        self.moe_layer_offset = moe_layer_offset

        # --- rotary embedding / head geometry ---
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.head_dim = head_dim

        # Shared-expert width: derived from the MoE settings unless the
        # caller supplied an explicit value.
        self.share_expert_dim = (
            self.moe_intermediate_size * self.moe_top_k
            if share_expert_dim is None
            else share_expert_dim
        )

        self.norm_expert_weight = norm_expert_weight
        self.max_position_embeddings = max_position_embeddings

        # --- router behaviour ---
        self.moe_router_activation = moe_router_activation
        self.moe_router_scaling_factor = moe_router_scaling_factor
        self.use_moe_router_bias = use_moe_router_bias
        self.need_fp32_gate = need_fp32_gate

        # --- attention variants and per-layer settings ---
        self.att_impl_type = att_impl_type
        self.use_head_wise_attn_gate = use_head_wise_attn_gate
        self.layer_types = layer_types
        self.use_rope_layers = use_rope_layers
        self.yarn_only_types = yarn_only_types
        self.attention_other_setting = attention_other_setting
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.swiglu_limits = swiglu_limits
        self.swiglu_limits_shared = swiglu_limits_shared

        # Special-token ids: substitute the defaults only for ``None``.
        bos_id = bos_token_id if bos_token_id is not None else 1
        eos_id = eos_token_id if eos_token_id is not None else [2, 3]

        # The ids are set on the instance and also handed to the base
        # constructor together with any remaining keyword arguments.
        self.bos_token_id = bos_id
        self.eos_token_id = eos_id
        super().__init__(
            bos_token_id=bos_id,
            eos_token_id=eos_id,
            **kwargs,
        )
vllm/v1/attention/backends/flash_attn.py
View file @
fc7980db
...
@@ -263,18 +263,6 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
...
@@ -263,18 +263,6 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
vllm_config
:
"VllmConfig"
,
vllm_config
:
"VllmConfig"
,
kv_cache_spec
:
"AttentionSpec"
,
kv_cache_spec
:
"AttentionSpec"
,
)
->
AttentionCGSupport
:
)
->
AttentionCGSupport
:
# FA2 does not support CUDA graphs with encoder-decoder models due to
# accuracy issues reported in https://github.com/vllm-project/vllm/issues/33091
if
(
vllm_config
.
model_config
.
is_encoder_decoder
and
get_flash_attn_version
()
==
2
):
logger
.
warning_once
(
"FlashAttention2 does not support CUDA graphs with "
"encoder-decoder models due to accuracy issues reported in #33091. "
"Disabling CUDA graph."
)
return
AttentionCGSupport
.
NEVER
return
cls
.
_cudagraph_support
return
cls
.
_cudagraph_support
def
__init__
(
def
__init__
(
...
...
vllm/v1/core/kv_cache_coordinator.py
View file @
fc7980db
...
@@ -479,6 +479,16 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
...
@@ -479,6 +479,16 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
hit_length
=
max_cache_hit_length
hit_length
=
max_cache_hit_length
hit_blocks_by_group
:
list
[
list
[
KVCacheBlock
]
|
None
]
=
[
None
]
*
num_groups
hit_blocks_by_group
:
list
[
list
[
KVCacheBlock
]
|
None
]
=
[
None
]
*
num_groups
# Simple hybrid (1 full attn + 1 other): one iteration suffices.
# Full attn is always first if it exists. This avoids EAGLE drops
# being applied multiple times to non-full-attn groups.
# FIXME (yifan): However, for complex hybrid models with multiple attn
# groups, we still have the EAGLE spiral block dropping problem. See
# discussion in issue https://github.com/vllm-project/vllm/issues/32802.
is_simple_hybrid
=
len
(
self
.
attention_groups
)
==
2
and
isinstance
(
self
.
attention_groups
[
0
][
0
],
FullAttentionSpec
)
while
True
:
while
True
:
curr_hit_length
=
hit_length
curr_hit_length
=
hit_length
...
@@ -495,10 +505,6 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
...
@@ -495,10 +505,6 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
# the last iteration.
# the last iteration.
num_blocks
=
curr_hit_length
//
spec
.
block_size
num_blocks
=
curr_hit_length
//
spec
.
block_size
curr_hit_length
=
num_blocks
*
spec
.
block_size
curr_hit_length
=
num_blocks
*
spec
.
block_size
for
group_id
in
group_ids
:
blocks
=
hit_blocks_by_group
[
group_id
]
assert
blocks
is
not
None
del
blocks
[
num_blocks
:]
else
:
else
:
hit_blocks
=
manager_cls
.
find_longest_cache_hit
(
hit_blocks
=
manager_cls
.
find_longest_cache_hit
(
block_hashes
=
_get_block_hashes
(
spec
),
block_hashes
=
_get_block_hashes
(
spec
),
...
@@ -513,10 +519,20 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
...
@@ -513,10 +519,20 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
for
group_id
,
blocks
in
zip
(
group_ids
,
hit_blocks
):
for
group_id
,
blocks
in
zip
(
group_ids
,
hit_blocks
):
hit_blocks_by_group
[
group_id
]
=
blocks
hit_blocks_by_group
[
group_id
]
=
blocks
if
curr_hit_length
<
hit_length
:
if
curr_hit_length
>=
hit_length
:
hit_length
=
curr_hit_length
else
:
break
break
hit_length
=
curr_hit_length
# Simple hybrid: exit after one iteration
if
is_simple_hybrid
:
break
# Truncate full attention blocks to final hit_length (if present)
spec
,
group_ids
,
_
=
self
.
attention_groups
[
0
]
if
isinstance
(
spec
,
FullAttentionSpec
):
num_blocks
=
hit_length
//
spec
.
block_size
for
group_id
in
group_ids
:
if
(
blks
:
=
hit_blocks_by_group
[
group_id
])
is
not
None
:
del
blks
[
num_blocks
:]
return
tuple
(
return
tuple
(
blocks
if
blocks
is
not
None
else
[]
for
blocks
in
hit_blocks_by_group
blocks
if
blocks
is
not
None
else
[]
for
blocks
in
hit_blocks_by_group
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment