Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c77bc77c
Commit
c77bc77c
authored
Jan 21, 2026
by
xiabo
Browse files
1、kvcache支持fp8的scale
parent
3d01cce7
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
18 additions
and
14 deletions
+18
-14
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+18
-14
No files found.
vllm/v1/attention/backends/flash_attn.py
View file @
c77bc77c
...
...
@@ -700,8 +700,9 @@ class FlashAttentionImpl(AttentionImpl):
prefix_scheduler_metadata
=
attn_metadata
.
prefix_scheduler_metadata
,
suffix_scheduler_metadata
=
attn_metadata
.
scheduler_metadata
,
# q_descale=layer._q_scale,
# k_descale=layer._k_scale,
# v_descale=layer._v_scale,
q_descale
=
None
,
k_descale
=
layer
.
_k_scale
,
v_descale
=
layer
.
_v_scale
,
)
return
output
...
...
@@ -779,6 +780,9 @@ class FlashAttentionImpl(AttentionImpl):
# q_descale=layer._q_scale.expand(descale_shape),
# k_descale=layer._k_scale.expand(descale_shape),
# v_descale=layer._v_scale.expand(descale_shape),
q_descale
=
None
,
k_descale
=
layer
.
_k_scale
,
v_descale
=
layer
.
_v_scale
,
is_prefix_cache
=
False
,
)
...
...
@@ -932,12 +936,12 @@ def cascade_attention(
return_softmax_lse
=
True
,
scheduler_metadata
=
prefix_scheduler_metadata
,
# fa_version=fa_version,
#
q_descale=q_descale.expand(descale_shape)
#
if q_descale is not None else None,
#
k_descale=k_descale.expand(descale_shape)
#
if k_descale is not None else None,
#
v_descale=v_descale.expand(descale_shape)
#
if v_descale is not None else None,
q_descale
=
q_descale
.
expand
(
descale_shape
)
if
q_descale
is
not
None
else
None
,
k_descale
=
k_descale
.
expand
(
descale_shape
)
if
k_descale
is
not
None
else
None
,
v_descale
=
v_descale
.
expand
(
descale_shape
)
if
v_descale
is
not
None
else
None
,
is_prefix_cache
=
True
,
)
...
...
@@ -985,12 +989,12 @@ def cascade_attention(
return_softmax_lse
=
True
,
scheduler_metadata
=
suffix_scheduler_metadata
,
# fa_version=fa_version,
#
q_descale=q_descale.expand(descale_shape)
#
if q_descale is not None else None,
#
k_descale=k_descale.expand(descale_shape)
#
if k_descale is not None else None,
#
v_descale=v_descale.expand(descale_shape)
#
if v_descale is not None else None,
q_descale
=
q_descale
.
expand
(
descale_shape
)
if
q_descale
is
not
None
else
None
,
k_descale
=
k_descale
.
expand
(
descale_shape
)
if
k_descale
is
not
None
else
None
,
v_descale
=
v_descale
.
expand
(
descale_shape
)
if
v_descale
is
not
None
else
None
,
is_prefix_cache
=
True
,
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment