Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
239ef0c1
Unverified
Commit
239ef0c1
authored
Sep 22, 2025
by
Michael Goin
Committed by
GitHub
Sep 22, 2025
Browse files
[CI Failure] Fix fp8 kv cache on <SM90 (#25396)
Signed-off-by: mgoin <mgoin64@gmail.com>
parent
1d7f95b8
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
6 additions
and
2 deletions
+6
-2
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+6
-2
No files found.
vllm/platforms/cuda.py
View file @
239ef0c1
...
...
@@ -286,6 +286,9 @@ class CudaPlatformBase(Platform):
 TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend"  # noqa: E501
 XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend"  # noqa: E501
+use_fp8_kv_cache = (kv_cache_dtype is not None
+                    and kv_cache_dtype.startswith("fp8"))
 if selected_backend == _Backend.FLASHINFER:
     logger.info_once("Using FlashInfer backend on V1 engine.")
     if cls.has_device_capability(100):
...
...
@@ -334,10 +337,11 @@ class CudaPlatformBase(Platform):
 # FlashAttention is the default for SM 8.0+ GPUs
 if cls.has_device_capability(80):
-    if has_sink and not cls.is_device_capability(90):
+    if (has_sink or
+            use_fp8_kv_cache) and not cls.is_device_capability(90):
         logger.info_once("Using Triton backend on V1 engine.")
         return TRITON_ATTN_VLLM_V1
-    if is_default_backend_supported := is_attn_backend_supported(
+    elif is_default_backend_supported := is_attn_backend_supported(
             FLASH_ATTN_V1, head_size, dtype,
             allow_import_error=False):
         logger.info_once(
             "Using Flash Attention backend on "
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment