Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
561b6cbb
Commit
561b6cbb
authored
Apr 10, 2026
by
王敏
Browse files
merge dev主干代码
parents
0beafe40
ce47a56e
Changes
21
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
239 additions
and
336 deletions
+239
-336
requirements/common.txt
requirements/common.txt
+1
-1
requirements/rocm.txt
requirements/rocm.txt
+1
-1
vllm/_custom_ops.py
vllm/_custom_ops.py
+5
-4
vllm/attention/layer.py
vllm/attention/layer.py
+1
-2
vllm/envs.py
vllm/envs.py
+8
-7
vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
...del_executor/layers/fused_moe/fused_moe_modular_method.py
+6
-1
vllm/model_executor/layers/fused_moe/modular_kernel.py
vllm/model_executor/layers/fused_moe/modular_kernel.py
+21
-5
vllm/model_executor/layers/layernorm.py
vllm/model_executor/layers/layernorm.py
+2
-3
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/linear.py
+7
-3
vllm/model_executor/layers/mla.py
vllm/model_executor/layers/mla.py
+4
-3
vllm/model_executor/layers/sparse_attn_indexer.py
vllm/model_executor/layers/sparse_attn_indexer.py
+15
-16
vllm/model_executor/models/config.py
vllm/model_executor/models/config.py
+3
-1
vllm/model_executor/models/deepseek_mtp.py
vllm/model_executor/models/deepseek_mtp.py
+1
-1
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+16
-8
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+3
-31
vllm/tool_parsers/qwen3coder_tool_parser.py
vllm/tool_parsers/qwen3coder_tool_parser.py
+7
-4
vllm/v1/attention/backend.py
vllm/v1/attention/backend.py
+0
-3
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+49
-184
vllm/v1/attention/ops/triton_unified_attention.py
vllm/v1/attention/ops/triton_unified_attention.py
+89
-54
vllm/v1/attention/selector.py
vllm/v1/attention/selector.py
+0
-4
No files found.
requirements/common.txt
View file @
561b6cbb
...
...
@@ -7,7 +7,7 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers
>
=
4.56.0, < 5
transformers
=
=
5.2.0
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf >= 6.33.5 # Required by LlamaTokenizer, gRPC. CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
...
...
requirements/rocm.txt
View file @
561b6cbb
...
...
@@ -26,7 +26,7 @@ fastrlock==0.8.3
torch == 2.9.0
triton == 3.3.0
flash_attn == 2.
6.1
flash_attn == 2.
8.3
flash_mla == 1.0.0
lightop == 0.6.0
lmslim == 0.3.1
vllm/_custom_ops.py
View file @
561b6cbb
...
...
@@ -370,7 +370,8 @@ def rms_norm_opt_fake(
def
fused_add_rms_norm_opt
(
input
:
torch
.
Tensor
,
residual
:
torch
.
Tensor
,
weight
:
torch
.
Tensor
,
epsilon
:
float
,
training
:
Optional
[
bool
]
=
False
,
inplace
:
Optional
[
bool
]
=
True
)
->
None
:
op
.
rn_add_forward_autograd
(
input
,
residual
,
weight
,
epsilon
,
training
,
inplace
)
op
.
fused_add_rms_norm_opt
(
input
,
residual
,
weight
,
epsilon
)
#op.rn_add_forward_autograd(input, residual, weight, epsilon, training, inplace)
def
fused_add_rms_norm_opt_fake
(
input
:
torch
.
Tensor
,
...
...
@@ -379,8 +380,8 @@ def fused_add_rms_norm_opt_fake(
epsilon
:
float
,
training
:
Optional
[
bool
]
=
False
,
inplace
:
Optional
[
bool
]
=
False
,
)
->
torch
.
Tensor
:
return
torch
.
empty_like
(
input
)
)
->
None
:
return
None
def
fused_qk_norm_rope
(
qkv
:
torch
.
Tensor
,
...
...
@@ -3626,7 +3627,7 @@ direct_register_custom_op(
direct_register_custom_op
(
op_name
=
"fused_add_rms_norm_opt"
,
op_func
=
fused_add_rms_norm_opt
,
mutates_args
=
[],
mutates_args
=
[
"input"
,
"residual"
],
fake_impl
=
fused_add_rms_norm_opt_fake
,
)
...
...
vllm/attention/layer.py
View file @
561b6cbb
...
...
@@ -245,7 +245,6 @@ class Attention(nn.Module, AttentionLayerBase):
use_mla
=
False
,
has_sink
=
self
.
has_sink
,
use_mm_prefix
=
self
.
use_mm_prefix
,
use_alibi_sqrt
=
bool
(
use_alibi_sqrt
),
attn_type
=
attn_type
,
)
else
:
...
...
vllm/envs.py
View file @
561b6cbb
...
...
@@ -323,10 +323,9 @@ if TYPE_CHECKING:
USE_LIGHTOP_PER_TOKEN_GROUP_QUANT_FP8
:
bool
=
False
USE_LIGHTOP_TOPK
:
bool
=
False
USE_LIGHTOP_CONVERT_REQ_INDEX_TO_GLOBAL_INDEX
:
bool
=
False
VLLM_DISABLE_DSA
:
bool
=
False
VLLM_MLA_CP
:
bool
=
False
VLLM_MLA_CPLB
:
bool
=
False
def
get_default_cache_root
():
return
os
.
getenv
(
"XDG_CACHE_HOME"
,
...
...
@@ -2006,15 +2005,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
"USE_LIGHTOP_CONVERT_REQ_INDEX_TO_GLOBAL_INDEX"
:
lambda
:
(
os
.
environ
.
get
(
"USE_LIGHTOP_CONVERT_REQ_INDEX_TO_GLOBAL_INDEX"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# If set to 1/True, enable mla context parallel
# If set to 1/True, disable DSA.
"VLLM_DISABLE_DSA"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_DISABLE_DSA"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
# If set to 1/True, enable mla context parallel
"VLLM_MLA_CP"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_MLA_CP"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
"VLLM_MLA_CPLB"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_MLA_CPLB"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
}
# --8<-- [end:env-vars-definition]
...
...
vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
View file @
561b6cbb
...
...
@@ -98,6 +98,8 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
use_nn_moe
:
bool
|
None
=
False
,
shared_output
:
torch
.
Tensor
|
None
=
None
,
routed_scaling_factor
:
float
=
1.0
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
return
self
.
fused_experts
(
hidden_states
=
x
,
...
...
@@ -110,4 +112,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
global_num_experts
=
layer
.
global_num_experts
,
apply_router_weight_on_input
=
layer
.
apply_router_weight_on_input
,
expert_map
=
None
if
self
.
disable_expert_map
else
layer
.
expert_map
,
use_nn_moe
=
use_nn_moe
,
shared_output
=
shared_output
,
routed_scaling_factor
=
routed_scaling_factor
,
)
vllm/model_executor/layers/fused_moe/modular_kernel.py
View file @
561b6cbb
...
...
@@ -735,6 +735,8 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
ExpertTokensMetadata
|
None
,
apply_router_weight_on_input
:
bool
,
shared_output
:
torch
.
Tensor
|
None
=
None
,
routed_scaling_factor
:
float
=
1.0
,
)
->
None
:
"""
This function computes the intermediate result of a Mixture of Experts
...
...
@@ -1155,6 +1157,8 @@ class FusedMoEModularKernel(torch.nn.Module):
apply_router_weight_on_input
:
bool
,
expert_tokens_meta
:
ExpertTokensMetadata
|
None
,
use_nn_moe
:
bool
|
None
=
False
,
shared_output
:
torch
.
Tensor
|
None
=
None
,
routed_scaling_factor
:
float
=
1.0
,
)
->
torch
.
Tensor
:
_
,
M_full
,
N
,
K
,
top_k
=
self
.
fused_experts
.
moe_problem_size
(
a1q
,
w1
,
w2
,
topk_ids
...
...
@@ -1216,7 +1220,13 @@ class FusedMoEModularKernel(torch.nn.Module):
c_fused_out
=
self
.
_slice_output_tensor
(
fused_out
,
chunk_idx
,
num_chunks
,
CHUNK_SIZE
,
M_full
)
c_shared_output
=
(
None
if
shared_output
is
None
else
self
.
_slice_output_tensor
(
shared_output
,
chunk_idx
,
num_chunks
,
CHUNK_SIZE
,
M_full
)
)
self
.
fused_experts
.
apply
(
output
=
c_fused_out
,
hidden_states
=
a1q
[
s
:
e
],
...
...
@@ -1234,6 +1244,8 @@ class FusedMoEModularKernel(torch.nn.Module):
expert_tokens_meta
=
c_expert_tokens_meta
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
use_nn_moe
=
use_nn_moe
,
shared_output
=
c_shared_output
,
routed_scaling_factor
=
routed_scaling_factor
,
)
return
fused_out
...
...
@@ -1246,13 +1258,12 @@ class FusedMoEModularKernel(torch.nn.Module):
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
apply_router_weight_on_input
:
bool
,
shared_output
:
torch
.
Tensor
|
None
=
None
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""
The _finalize method is a wrapper around self.prepare_finalize.finalize
that handles DBO, async and shared expert overlap.
"""
shared_output
:
torch
.
Tensor
|
None
=
None
if
not
self
.
prepare_finalize
.
supports_async
():
assert
not
dbo_enabled
()
...
...
@@ -1264,11 +1275,11 @@ class FusedMoEModularKernel(torch.nn.Module):
apply_router_weight_on_input
,
self
.
fused_experts
.
finalize_weight_and_reduce_impl
(),
)
if
self
.
shared_experts
is
not
None
:
if
shared_output
is
None
and
self
.
shared_experts
is
not
None
:
shared_output
=
self
.
shared_experts
(
hidden_states
)
else
:
self
.
alt_event
.
record
()
if
self
.
shared_experts
is
not
None
:
if
shared_output
is
None
and
self
.
shared_experts
is
not
None
:
shared_output
=
self
.
shared_experts
(
hidden_states
)
current_stream
=
torch
.
cuda
.
current_stream
()
...
...
@@ -1327,6 +1338,8 @@ class FusedMoEModularKernel(torch.nn.Module):
expert_map
:
torch
.
Tensor
|
None
=
None
,
apply_router_weight_on_input
:
bool
=
False
,
use_nn_moe
:
bool
|
None
=
False
,
shared_output
:
torch
.
Tensor
|
None
=
None
,
routed_scaling_factor
:
float
=
1.0
,
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""
This function computes a Mixture of Experts (MoE) layer using two sets
...
...
@@ -1389,6 +1402,8 @@ class FusedMoEModularKernel(torch.nn.Module):
apply_router_weight_on_input
=
apply_router_weight_on_input
,
expert_tokens_meta
=
expert_tokens_meta
,
use_nn_moe
=
use_nn_moe
,
shared_output
=
shared_output
,
routed_scaling_factor
=
routed_scaling_factor
,
)
return
self
.
_finalize
(
...
...
@@ -1398,4 +1413,5 @@ class FusedMoEModularKernel(torch.nn.Module):
topk_weights
,
topk_ids
,
apply_router_weight_on_input
,
shared_output
=
shared_output
,
)
\ No newline at end of file
vllm/model_executor/layers/layernorm.py
View file @
561b6cbb
...
...
@@ -57,9 +57,8 @@ def fused_add_rms_norm(
return
rms_norm_batch_invariant
(
x
+
residual
,
weight
,
variance_epsilon
),
x
+
residual
# if envs.VLLM_USE_OPT_OP:
if
False
:
ops
.
fused_add_rms_norm_opt
(
if
envs
.
VLLM_USE_OPT_OP
:
torch
.
ops
.
vllm
.
fused_add_rms_norm_opt
(
x
,
residual
,
weight
,
...
...
vllm/model_executor/layers/linear.py
View file @
561b6cbb
...
...
@@ -271,7 +271,7 @@ class UnquantizedLinearMethod(LinearMethodBase):
self
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
bias
:
torch
.
Tensor
|
None
=
None
,
bias
:
torch
.
Tensor
|
None
=
None
,
**
_
)
->
torch
.
Tensor
:
if
self
.
use_llama_nn
:
# if os.environ['GEMM_PAD'] == '1' and gemm_bank_conf(layer.weight.shape[1] - 32):
...
...
@@ -458,10 +458,14 @@ class ReplicatedLinear(LinearBase):
def
forward
(
self
,
x
:
torch
.
Tensor
,
*
,
iqis
:
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]
|
None
=
None
)
->
torch
.
Tensor
|
tuple
[
torch
.
Tensor
,
Parameter
|
None
]:
bias
=
self
.
bias
if
not
self
.
skip_bias_add
else
None
assert
self
.
quant_method
is
not
None
if
envs
.
USE_FUSED_RMS_QUANT
and
iqis
is
not
None
and
iqis
[
0
]
is
not
None
:
output
=
self
.
quant_method
.
apply
(
self
,
x
,
bias
,
input_quant_args
=
iqis
)
else
:
output
=
self
.
quant_method
.
apply
(
self
,
x
,
bias
)
if
not
self
.
return_bias
:
...
...
vllm/model_executor/layers/mla.py
View file @
561b6cbb
...
...
@@ -181,9 +181,10 @@ class MultiHeadLatentAttentionWrapper(PluggableLayer):
)
if
self
.
indexer
and
self
.
is_sparse
:
_topk_indices
=
self
.
indexer
(
hidden_states
,
q_c
,
positions
,
self
.
indexer_rope_emb
)
if
envs
.
USE_FUSED_RMS_QUANT
and
iqis
is
not
None
:
_topk_indices
=
self
.
indexer
(
hidden_states
,
q_c
,
positions
,
self
.
indexer_rope_emb
,
iqis
=
iqis
)
else
:
_topk_indices
=
self
.
indexer
(
hidden_states
,
q_c
,
positions
,
self
.
indexer_rope_emb
)
if
llama_4_scaling
is
not
None
:
q
*=
llama_4_scaling
...
...
vllm/model_executor/layers/sparse_attn_indexer.py
View file @
561b6cbb
...
...
@@ -31,7 +31,6 @@ elif current_platform.is_xpu():
logger
=
init_logger
(
__name__
)
@
maybe_transfer_kv_layer
def
sparse_attn_indexer
(
hidden_states
:
torch
.
Tensor
,
...
...
vllm/model_executor/models/config.py
View file @
561b6cbb
...
...
@@ -3,6 +3,7 @@
from
copy
import
deepcopy
from
math
import
lcm
from
typing
import
TYPE_CHECKING
from
vllm
import
envs
from
vllm.logger
import
init_logger
from
vllm.model_executor.models
import
ModelRegistry
...
...
@@ -554,7 +555,8 @@ class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
# For DeepSeekV3.2, a custom fp8 format is used when fp8 kv-cache is enabled.
cache_config
=
vllm_config
.
cache_config
if
cache_config
.
cache_dtype
.
startswith
(
"fp8"
):
force_disable_dsa
=
envs
.
VLLM_DISABLE_DSA
if
cache_config
.
cache_dtype
.
startswith
(
"fp8"
)
and
not
force_disable_dsa
:
cache_config
.
cache_dtype
=
"fp8_ds_mla"
logger
.
info
(
"Using custom fp8 kv-cache format for DeepSeekV3.2"
)
if
cache_config
.
cache_dtype
==
"bfloat16"
:
...
...
vllm/model_executor/models/deepseek_mtp.py
View file @
561b6cbb
...
...
@@ -84,7 +84,7 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
self
.
device
=
current_platform
.
device_type
# Added check: DSA is enabled by default; set VLLM_DISABLE_DSA to turn it off.
force_disable_dsa
=
os
.
environ
.
get
(
"
VLLM_DISABLE_DSA
"
,
"0"
)
==
"1"
force_disable_dsa
=
envs
.
VLLM_DISABLE_DSA
self
.
is_v32
=
hasattr
(
config
,
"index_topk"
)
and
not
force_disable_dsa
if
self
.
is_v32
:
topk_tokens
=
config
.
index_topk
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
561b6cbb
...
...
@@ -815,14 +815,17 @@ class Indexer(nn.Module):
)
def
forward
(
self
,
hidden_states
:
torch
.
Tensor
,
qr
:
torch
.
Tensor
,
positions
,
rotary_emb
self
,
hidden_states
:
torch
.
Tensor
,
qr
:
torch
.
Tensor
,
positions
,
rotary_emb
,
*
,
iqis
:
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]
|
None
=
None
)
->
torch
.
Tensor
:
q
,
_
=
self
.
wq_b
(
qr
)
q
=
q
.
view
(
-
1
,
self
.
n_head
,
self
.
head_dim
)
q_pe
,
q_nope
=
torch
.
split
(
q
,
[
self
.
rope_dim
,
self
.
head_dim
-
self
.
rope_dim
],
dim
=-
1
)
if
envs
.
USE_FUSED_RMS_QUANT
and
self
.
wk
.
weight
.
dtype
==
torch
.
int8
and
iqis
is
not
None
:
k
,
_
=
self
.
wk
(
hidden_states
,
iqis
=
iqis
)
else
:
k
,
_
=
self
.
wk
(
hidden_states
)
k
=
self
.
k_norm
(
k
)
k_pe
,
k_nope
=
torch
.
split
(
...
...
@@ -861,6 +864,9 @@ class Indexer(nn.Module):
else
:
q_fp8
=
q
if
envs
.
USE_FUSED_RMS_QUANT
and
self
.
weights_proj
.
weight
.
dtype
==
torch
.
int8
and
iqis
is
not
None
:
weights
,
_
=
self
.
weights_proj
(
hidden_states
,
iqis
=
iqis
)
else
:
weights
,
_
=
self
.
weights_proj
(
hidden_states
)
if
not
current_platform
.
is_rocm
()
or
torch
.
cuda
.
get_device_properties
(
"cuda"
).
gcnArchName
.
split
(
':'
)[
0
]
==
"gfx938"
:
weights
=
(
...
...
@@ -997,7 +1003,7 @@ class DeepseekV2MLAAttention(nn.Module):
mscale
=
yarn_get_mscale
(
scaling_factor
,
float
(
mscale_all_dim
))
self
.
scaling
=
self
.
scaling
*
mscale
*
mscale
# Added check: DSA is enabled by default; set VLLM_DISABLE_DSA to turn it off.
force_disable_dsa
=
os
.
environ
.
get
(
"
VLLM_DISABLE_DSA
"
,
"0"
)
==
"1"
force_disable_dsa
=
envs
.
VLLM_DISABLE_DSA
self
.
is_v32
=
hasattr
(
config
,
"index_topk"
)
and
not
force_disable_dsa
if
self
.
is_v32
:
...
...
@@ -1169,19 +1175,21 @@ class DeepseekV2DecoderLayer(nn.Module):
# Fix residual FP16 overflow
residual_fix_overflow
=
False
assert
self
.
input_layernorm
.
has_weight
is
True
# DSA should set update_input True
_dsa_flag
=
hasattr
(
self
.
self_attn
,
"indexer"
)
and
self
.
self_attn
.
indexer
is
not
None
if
residual
is
None
:
residual
=
hidden_states
.
clone
()
i_q
,
i_s
,
_
=
self
.
input_layernorm
(
x
=
hidden_states
,
residual
=
None
,
quant_dtype
=
torch
.
int8
,
update_input
=
False
update_input
=
_dsa_flag
)
residual_fix_overflow
=
True
else
:
i_q
,
i_s
,
residual
=
self
.
input_layernorm
(
x
=
hidden_states
,
residual
=
residual
,
quant_dtype
=
torch
.
int8
,
update_input
=
False
update_input
=
_dsa_flag
)
attn_kwargs
=
{
"positions"
:
positions
,
...
...
@@ -1318,7 +1326,7 @@ class DeepseekV2Model(nn.Module):
self
.
vocab_size
=
config
.
vocab_size
# Added check: DSA is enabled by default; set VLLM_DISABLE_DSA to turn it off.
force_disable_dsa
=
os
.
environ
.
get
(
"
VLLM_DISABLE_DSA
"
,
"0"
)
==
"1"
force_disable_dsa
=
envs
.
VLLM_DISABLE_DSA
self
.
is_v32
=
hasattr
(
config
,
"index_topk"
)
and
not
force_disable_dsa
if
self
.
is_v32
:
...
...
vllm/platforms/rocm.py
View file @
561b6cbb
...
...
@@ -262,7 +262,6 @@ class RocmPlatform(Platform):
from
vllm._aiter_ops
import
rocm_aiter_ops
block_size
=
attn_selector_config
.
block_size
head_size
=
attn_selector_config
.
head_size
kv_cache_dtype
=
attn_selector_config
.
kv_cache_dtype
if
attn_selector_config
.
use_sparse
:
...
...
@@ -305,36 +304,9 @@ class RocmPlatform(Platform):
f
"is not MLA type while requested for MLA backend."
)
is_non64_block_multiple_64
=
(
block_size
!=
64
and
block_size
%
64
==
0
)
use_unified_flash
=
(
is_non64_block_multiple_64
and
head_size
==
256
)
if
(
envs
.
VLLM_USE_FLASH_ATTN_PA
and
is_non64_block_multiple_64
and
head_size
!=
256
):
logger
.
info_once
(
"Skip unified varlen kernel on V1 engine: head size %d is "
"unsupported (requires 256)."
,
head_size
,
)
if
envs
.
VLLM_USE_FLASH_ATTN_PA
and
(
block_size
==
64
or
use_unified_flash
):
if
use_unified_flash
and
block_size
!=
64
:
logger
.
info_once
(
"Using Flash Attention backend with unified varlen kernel on "
"V1 engine. (block size %d, requires block size divisible by 64)"
,
block_size
,
)
else
:
logger
.
info_once
(
"Using Flash Attention backend on V1 engine. "
"(only supports block size 64)"
)
if
envs
.
VLLM_USE_FLASH_ATTN_PA
and
block_size
==
64
:
logger
.
info_once
(
"Using Flash Attention backend on V1 engine. (only supports block size 64)"
)
return
AttentionBackendEnum
.
FLASH_ATTN
.
get_path
()
else
:
os
.
environ
[
'VLLM_USE_FLASH_ATTN_PA'
]
=
'0'
...
...
vllm/tool_parsers/qwen3coder_tool_parser.py
View file @
561b6cbb
...
...
@@ -243,7 +243,10 @@ class Qwen3CoderToolParser(ToolParser):
self
,
function_call_str
:
str
,
tools
:
list
[
ChatCompletionToolsParam
]
|
None
)
->
ToolCall
|
None
:
# Extract function name
end_index
=
function_call_str
.
index
(
">"
)
end_index
=
function_call_str
.
find
(
">"
)
# If there's no ">" character, this is not a valid xml function call
if
end_index
==
-
1
:
return
None
function_name
=
function_call_str
[:
end_index
]
param_config
=
self
.
_get_arguments_config
(
function_name
,
tools
)
parameters
=
function_call_str
[
end_index
+
1
:]
...
...
@@ -327,10 +330,10 @@ class Qwen3CoderToolParser(ToolParser):
idx
=
model_output
.
find
(
self
.
tool_call_prefix
)
content_index
=
content_index
if
content_index
>=
0
else
idx
content
=
model_output
[:
content_index
]
# .rstrip()
valid_tool_calls
=
[
tc
for
tc
in
tool_calls
if
tc
is
not
None
]
return
ExtractedToolCallInformation
(
tools_called
=
(
len
(
tool_calls
)
>
0
),
tool_calls
=
tool_calls
,
tools_called
=
(
len
(
valid_
tool_calls
)
>
0
),
tool_calls
=
valid_
tool_calls
,
content
=
content
if
content
else
None
,
)
...
...
vllm/v1/attention/backend.py
View file @
561b6cbb
...
...
@@ -225,7 +225,6 @@ class AttentionBackend(ABC):
has_sink
:
bool
,
use_sparse
:
bool
,
use_mm_prefix
:
bool
,
use_alibi_sqrt
:
bool
,
device_capability
:
"DeviceCapability"
,
attn_type
:
str
,
)
->
list
[
str
]:
...
...
@@ -242,8 +241,6 @@ class AttentionBackend(ABC):
invalid_reasons
.
append
(
"partial multimodal token full attention not supported"
)
if
use_alibi_sqrt
and
not
cls
.
supports_alibi_sqrt
():
invalid_reasons
.
append
(
"use_alibi_sqrt not supported"
)
if
use_mla
!=
cls
.
is_mla
():
if
use_mla
:
invalid_reasons
.
append
(
"MLA not supported"
)
...
...
vllm/v1/attention/backends/flash_attn.py
View file @
561b6cbb
...
...
@@ -33,13 +33,6 @@ if is_flash_attn_varlen_func_available():
vllm_flash_attn_varlen_func
,
reshape_and_cache_cuda
,
)
from
vllm.v1.attention.ops.triton_reshape_and_cache_flash
import
(
triton_reshape_and_cache_flash
,
)
try
:
from
flash_attn
import
varlen_fwd_unified
except
Exception
:
varlen_fwd_unified
=
None
else
:
from
vllm.v1.attention.backends.fa_utils
import
(
flash_attn_supports_sinks
,
...
...
@@ -120,38 +113,6 @@ class FlashAttentionBackend(AttentionBackend):
def
get_builder_cls
()
->
type
[
"FlashAttentionMetadataBuilder"
]:
return
FlashAttentionMetadataBuilder
@
classmethod
def
supports_alibi_sqrt
(
cls
)
->
bool
:
return
True
@
classmethod
def
supports_mm_prefix
(
cls
)
->
bool
:
return
True
@
staticmethod
def
_use_rocm_unified_kv_layout
(
block_size
:
int
|
None
=
None
,
key_cache
:
torch
.
Tensor
|
None
=
None
,
value_cache
:
torch
.
Tensor
|
None
=
None
,
)
->
bool
:
if
not
current_platform
.
is_rocm
():
return
False
if
block_size
is
None
:
if
key_cache
is
not
None
and
value_cache
is
not
None
:
if
key_cache
.
ndim
!=
4
or
value_cache
.
ndim
!=
4
:
return
False
if
key_cache
.
shape
!=
value_cache
.
shape
:
return
False
block_size
=
key_cache
.
shape
[
1
]
else
:
try
:
block_size
=
get_current_vllm_config
().
cache_config
.
block_size
except
Exception
:
return
False
return
block_size
is
not
None
and
block_size
!=
64
and
block_size
%
64
==
0
if
current_platform
.
is_rocm
():
@
staticmethod
def
get_kv_cache_shape
(
...
...
@@ -163,9 +124,6 @@ class FlashAttentionBackend(AttentionBackend):
)
->
tuple
[
tuple
[
int
,
...],
tuple
[
int
,
...]]:
if
block_size
%
16
!=
0
:
raise
ValueError
(
"Block size must be a multiple of 16."
)
if
FlashAttentionBackend
.
_use_rocm_unified_kv_layout
(
block_size
):
unified_shape
=
(
num_blocks
,
block_size
,
num_kv_heads
,
head_size
)
return
(
unified_shape
,
unified_shape
)
return
(
(
num_blocks
,
num_kv_heads
,
block_size
,
head_size
),
(
num_blocks
,
num_kv_heads
,
head_size
,
block_size
),
...
...
@@ -178,17 +136,6 @@ class FlashAttentionBackend(AttentionBackend):
# `stride_order` indicates the permutation that gets
# us from `get_kv_cache_shape` to the actual memory layout we want.
cache_layout
=
get_kv_cache_layout
()
if
FlashAttentionBackend
.
_use_rocm_unified_kv_layout
():
if
cache_layout
!=
"NHD"
:
raise
RuntimeError
(
"ROCm unified KV layout currently supports NHD only."
)
if
include_num_layers_dimension
:
# (num_blocks, num_layers, block_size, num_kv_heads, head_size)
return
(
1
,
0
,
2
,
3
,
4
),
(
1
,
0
,
2
,
3
,
4
)
key_stride_order
=
(
0
,
1
,
2
,
3
)
value_stride_order
=
(
0
,
1
,
2
,
3
)
else
:
if
cache_layout
==
"NHD"
and
include_num_layers_dimension
:
# (num_blocks, num_layers, block_size, num_kv_heads, head_size)
return
(
1
,
0
,
3
,
2
,
5
),
(
1
,
0
,
4
,
2
,
3
)
...
...
@@ -324,34 +271,8 @@ class FlashAttentionMetadata:
prefix_scheduler_metadata
:
torch
.
Tensor
|
None
=
None
max_num_splits
:
int
=
0
mm_prefix_range
:
dict
[
int
,
list
[
tuple
[
int
,
int
]]]
|
None
=
None
qq_bias
:
torch
.
Tensor
|
None
=
None
causal
:
bool
=
True
@
property
def
mm_prefix_range_tensor
(
self
)
->
torch
.
Tensor
|
None
:
if
self
.
mm_prefix_range
is
None
:
return
None
num_seqs
=
self
.
seq_lens
.
shape
[
0
]
device
=
self
.
seq_lens
.
device
range_lists
=
[
self
.
mm_prefix_range
.
get
(
i
,
[(
0
,
0
)])
or
[(
0
,
0
)]
for
i
in
range
(
num_seqs
)
]
if
all
(
r
==
[(
0
,
0
)]
for
r
in
range_lists
):
return
None
range_tensors
=
[
torch
.
tensor
(
r
,
dtype
=
torch
.
int32
,
device
=
device
).
view
(
-
1
,
2
)
for
r
in
range_lists
]
return
torch
.
nested
.
nested_tensor
(
range_tensors
,
layout
=
torch
.
jagged
).
to_padded_tensor
(
0
)
def
_get_sliding_window_configs
(
vllm_config
:
VllmConfig
,
...
...
@@ -676,7 +597,6 @@ class FlashAttentionImpl(AttentionImpl):
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
str
|
None
=
None
,
sinks
:
torch
.
Tensor
|
None
=
None
,
use_alibi_sqrt
:
bool
=
False
,
)
->
None
:
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
...
...
@@ -702,7 +622,6 @@ class FlashAttentionImpl(AttentionImpl):
self
.
attn_type
=
attn_type
self
.
vllm_flash_attn_version
=
get_flash_attn_version
()
self
.
use_alibi_sqrt
=
use_alibi_sqrt
# Cache the batch invariant result for use in forward passes
self
.
batch_invariant_enabled
=
vllm_is_batch_invariant
()
...
...
@@ -729,14 +648,6 @@ class FlashAttentionImpl(AttentionImpl):
else
False
)
def
_get_unified_extras
(
self
,
attn_metadata
:
FlashAttentionMetadata
,
)
->
tuple
[
torch
.
Tensor
|
None
,
torch
.
Tensor
|
None
]:
mm_prefix_range_tensor
=
attn_metadata
.
mm_prefix_range_tensor
qq_bias
=
attn_metadata
.
qq_bias
return
mm_prefix_range_tensor
,
qq_bias
def
forward
(
self
,
layer
:
torch
.
nn
.
Module
,
...
...
@@ -863,36 +774,6 @@ class FlashAttentionImpl(AttentionImpl):
print
(
f
"q.shape =
{
query
[:
num_actual_tokens
].
shape
}
, key_cache.shape =
{
key_cache
.
shape
}
, value_cache.shape =
{
value_cache
.
shape
}
"
)
print
(
f
"cu_seqlens_q.shape =
{
cu_seqlens_q
.
shape
}
, max_seqlen_q =
{
max_seqlen_q
}
, seqused_k.shape =
{
seqused_k
.
shape
}
, max_seqlen_k =
{
max_seqlen_k
}
"
)
print
(
f
"softmax_scale =
{
self
.
scale
:.
3
f
}
, alibi_slopes =
{
self
.
alibi_slopes
}
, window_size =
{
self
.
sliding_window
}
, block_tables.shape =
{
block_table
.
shape
}
, softcap =
{
self
.
logits_soft_cap
}
, scheduler_metadata =
{
scheduler_metadata
}
"
)
use_unified_kv_layout
=
(
FlashAttentionBackend
.
_use_rocm_unified_kv_layout
(
key_cache
=
key_cache
,
value_cache
=
value_cache
)
)
if
use_unified_kv_layout
:
mm_prefix_range_tensor
,
qq_bias
=
self
.
_get_unified_extras
(
attn_metadata
)
varlen_fwd_unified
(
q
=
query
[:
num_actual_tokens
],
k
=
key_cache
,
v
=
value_cache
,
cu_seqlens_q
=
cu_seqlens_q
,
seqused_k
=
seqused_k
,
block_table
=
block_table
,
max_seqlen_q
=
max_seqlen_q
,
max_seqlen_k
=
max_seqlen_k
,
softmax_scale
=
self
.
scale
,
causal
=
attn_metadata
.
causal
,
softcap
=
self
.
logits_soft_cap
,
window_size
=
tuple
(
self
.
sliding_window
),
alibi_slopes
=
self
.
alibi_slopes
,
use_alibi_sqrt
=
self
.
use_alibi_sqrt
,
qq_bias
=
qq_bias
,
s_aux
=
self
.
sinks
,
mm_prefix_range
=
mm_prefix_range_tensor
,
return_softmax_lse
=
False
,
out
=
output
[:
num_actual_tokens
],
)
else
:
vllm_flash_attn_varlen_func
(
q
=
query
[:
num_actual_tokens
],
k
=
key_cache
,
...
...
@@ -1008,24 +889,8 @@ class FlashAttentionImpl(AttentionImpl):
# op uses the slot_mapping's shape to determine the number of
# actual tokens.
if
current_platform
.
is_rocm
():
if
FlashAttentionBackend
.
_use_rocm_unified_kv_layout
(
key_cache
=
key_cache
,
value_cache
=
value_cache
,
):
triton_reshape_and_cache_flash
(
key
,
value
,
key_cache
,
value_cache
,
slot_mapping
,
self
.
kv_cache_dtype
,
layer
.
_k_scale
,
layer
.
_v_scale
,
)
else
:
if
envs
.
VLLM_USE_OPT_RESHAPE_AND_CACHE
:
from
lightop
import
reshape_and_cache_cuda
reshape_and_cache_cuda
(
key
,
value
,
...
...
vllm/v1/attention/ops/triton_unified_attention.py
View file @
561b6cbb
...
...
@@ -12,6 +12,10 @@ import torch
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
vllm.triton_utils
import
tl
,
triton
try
:
from
flash_attn
import
varlen_fwd_unified
except
Exception
:
varlen_fwd_unified
=
None
logger
=
init_logger
(
__name__
)
float8_info
=
torch
.
finfo
(
current_platform
.
fp8_dtype
())
...
...
@@ -983,6 +987,14 @@ def unified_attention(
or
num_seqs
>
seq_threshold_3D
):
# print(f"[2D Triton] k shape: {k.shape}, v shape: {v.shape}")
use_fa_unified_2d
=
(
current_platform
.
is_rocm
()
and
varlen_fwd_unified
is
not
None
and
block_size
%
64
==
0
and
head_size
==
256
)
if
not
use_fa_unified_2d
:
# print("Running Triton kernel")
kernel_unified_attention_2d
[
(
total_num_q_blocks
,
...
...
@@ -1038,6 +1050,29 @@ def unified_attention(
BLOCK_M
=
BLOCK_M
,
USE_FP8
=
output_scale
is
not
None
,
)
else
:
# print("Running FA kernel")
varlen_fwd_unified
(
q
=
q
,
k
=
k
,
v
=
v
,
cu_seqlens_q
=
cu_seqlens_q
,
seqused_k
=
seqused_k
,
block_table
=
block_table
,
max_seqlen_q
=
max_seqlen_q
,
max_seqlen_k
=
max_seqlen_k
,
softmax_scale
=
softmax_scale
,
causal
=
causal
,
softcap
=
softcap
,
window_size
=
window_size
,
alibi_slopes
=
alibi_slopes
,
use_alibi_sqrt
=
use_alibi_sqrt
,
qq_bias
=
qq_bias
,
s_aux
=
sinks
,
mm_prefix_range
=
mm_prefix_range
,
return_softmax_lse
=
False
,
out
=
out
,
)
else
:
# print(f"[3D Triton] k shape: {k.shape}, v shape: {v.shape}")
kernel_unified_attention_3d
[
...
...
vllm/v1/attention/selector.py
View file @
561b6cbb
...
...
@@ -27,7 +27,6 @@ class AttentionSelectorConfig(NamedTuple):
has_sink
:
bool
=
False
use_sparse
:
bool
=
False
use_mm_prefix
:
bool
=
False
use_alibi_sqrt
:
bool
=
False
attn_type
:
str
=
AttentionType
.
DECODER
def
__repr__
(
self
):
...
...
@@ -40,7 +39,6 @@ class AttentionSelectorConfig(NamedTuple):
f
"has_sink=
{
self
.
has_sink
}
, "
f
"use_sparse=
{
self
.
use_sparse
}
, "
f
"use_mm_prefix=
{
self
.
use_mm_prefix
}
, "
f
"use_alibi_sqrt=
{
self
.
use_alibi_sqrt
}
, "
f
"attn_type=
{
self
.
attn_type
}
)"
)
...
...
@@ -54,7 +52,6 @@ def get_attn_backend(
has_sink
:
bool
=
False
,
use_sparse
:
bool
=
False
,
use_mm_prefix
:
bool
=
False
,
use_alibi_sqrt
:
bool
=
False
,
attn_type
:
str
|
None
=
None
,
)
->
type
[
AttentionBackend
]:
"""Selects which attention backend to use and lazily imports it."""
...
...
@@ -80,7 +77,6 @@ def get_attn_backend(
has_sink
=
has_sink
,
use_sparse
=
use_sparse
,
use_mm_prefix
=
use_mm_prefix
,
use_alibi_sqrt
=
use_alibi_sqrt
,
attn_type
=
attn_type
or
AttentionType
.
DECODER
,
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment