Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
07a606aa
Unverified
Commit
07a606aa
authored
Nov 13, 2025
by
Huamin Li
Committed by
GitHub
Nov 13, 2025
Browse files
[CI Failure] Fix backend selection for encoder-only models (#28534)
Signed-off-by:
Huamin Li
<
3ericli@gmail.com
>
parent
a7791eac
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
75 additions
and
6 deletions
+75
-6
vllm/attention/backends/abstract.py
vllm/attention/backends/abstract.py
+14
-0
vllm/attention/layer.py
vllm/attention/layer.py
+1
-0
vllm/attention/layers/encoder_only_attention.py
vllm/attention/layers/encoder_only_attention.py
+5
-1
vllm/attention/selector.py
vllm/attention/selector.py
+5
-0
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+1
-0
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+10
-0
vllm/platforms/interface.py
vllm/platforms/interface.py
+1
-0
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+1
-0
vllm/platforms/tpu.py
vllm/platforms/tpu.py
+1
-0
vllm/platforms/xpu.py
vllm/platforms/xpu.py
+1
-0
vllm/v1/attention/backends/cpu_attn.py
vllm/v1/attention/backends/cpu_attn.py
+11
-0
vllm/v1/attention/backends/flash_attn.py
vllm/v1/attention/backends/flash_attn.py
+12
-0
vllm/v1/attention/backends/flex_attention.py
vllm/v1/attention/backends/flex_attention.py
+7
-0
vllm/v1/attention/backends/mla/flashmla_sparse.py
vllm/v1/attention/backends/mla/flashmla_sparse.py
+5
-5
No files found.
vllm/attention/backends/abstract.py
View file @
07a606aa
...
...
@@ -142,6 +142,17 @@ class AttentionBackend(ABC):
def
is_sparse
(
cls
)
->
bool
:
return
False
@
classmethod
def
supports_attn_type
(
cls
,
attn_type
:
str
)
->
bool
:
"""Check if backend supports a given attention type.
By default, only supports decoder attention.
Backends should override this to support other attention types.
"""
from
vllm.attention
import
AttentionType
return
attn_type
==
AttentionType
.
DECODER
@
classmethod
def
supports_compute_capability
(
cls
,
capability
:
"DeviceCapability"
)
->
bool
:
return
True
...
...
@@ -171,6 +182,7 @@ class AttentionBackend(ABC):
has_sink
:
bool
,
use_sparse
:
bool
,
device_capability
:
"DeviceCapability"
,
attn_type
:
str
,
)
->
list
[
str
]:
invalid_reasons
=
[]
if
not
cls
.
supports_head_size
(
head_size
):
...
...
@@ -195,6 +207,8 @@ class AttentionBackend(ABC):
invalid_reasons
.
append
(
"non-sparse not supported"
)
if
not
cls
.
supports_compute_capability
(
device_capability
):
invalid_reasons
.
append
(
"compute capability not supported"
)
if
not
cls
.
supports_attn_type
(
attn_type
):
invalid_reasons
.
append
(
f
"attention type
{
attn_type
}
not supported"
)
combination_reason
=
cls
.
supports_combination
(
head_size
,
dtype
,
...
...
vllm/attention/layer.py
View file @
07a606aa
...
...
@@ -291,6 +291,7 @@ class Attention(nn.Module, AttentionLayerBase):
block_size
,
use_mla
=
False
,
has_sink
=
self
.
has_sink
,
attn_type
=
attn_type
,
)
else
:
self
.
attn_backend
=
attn_backend
...
...
vllm/attention/layers/encoder_only_attention.py
View file @
07a606aa
...
...
@@ -74,7 +74,11 @@ class EncoderOnlyAttention(Attention):
block_size
=
16
underlying_attn_backend
=
get_attn_backend
(
head_size
,
dtype
,
kv_cache_dtype
,
block_size
head_size
,
dtype
,
kv_cache_dtype
,
block_size
,
attn_type
=
AttentionType
.
ENCODER_ONLY
,
)
attn_backend
=
create_encoder_only_attention_backend
(
underlying_attn_backend
)
...
...
vllm/attention/selector.py
View file @
07a606aa
...
...
@@ -76,6 +76,7 @@ def get_attn_backend(
use_mla
:
bool
=
False
,
has_sink
:
bool
=
False
,
use_sparse
:
bool
=
False
,
attn_type
:
str
|
None
=
None
,
)
->
type
[
AttentionBackend
]:
"""Selects which attention backend to use and lazily imports it."""
...
...
@@ -94,6 +95,7 @@ def get_attn_backend(
use_mla
=
use_mla
,
has_sink
=
has_sink
,
use_sparse
=
use_sparse
,
attn_type
=
attn_type
,
)
...
...
@@ -106,6 +108,7 @@ def _cached_get_attn_backend(
use_mla
:
bool
=
False
,
has_sink
:
bool
=
False
,
use_sparse
:
bool
=
False
,
attn_type
:
str
|
None
=
None
,
)
->
type
[
AttentionBackend
]:
# Check whether a particular choice of backend was
# previously forced.
...
...
@@ -159,6 +162,7 @@ def _cached_get_attn_backend(
use_mla
,
has_sink
,
use_sparse
,
attn_type
,
)
else
:
attention_cls
=
current_platform
.
get_attn_backend_cls
(
...
...
@@ -170,6 +174,7 @@ def _cached_get_attn_backend(
use_mla
,
has_sink
,
use_sparse
,
attn_type
,
)
if
not
attention_cls
:
raise
ValueError
(
...
...
vllm/platforms/cpu.py
View file @
07a606aa
...
...
@@ -134,6 +134,7 @@ class CpuPlatform(Platform):
use_mla
:
bool
,
has_sink
:
bool
,
use_sparse
:
bool
,
attn_type
:
str
|
None
=
None
,
)
->
str
:
from
vllm.attention.backends.registry
import
AttentionBackendEnum
...
...
vllm/platforms/cuda.py
View file @
07a606aa
...
...
@@ -298,6 +298,7 @@ class CudaPlatformBase(Platform):
has_sink
,
use_sparse
,
device_capability
,
attn_type
,
)
->
tuple
[
list
[
tuple
[
"AttentionBackendEnum"
,
int
]],
dict
[
"AttentionBackendEnum"
,
list
[
str
]],
...
...
@@ -318,6 +319,7 @@ class CudaPlatformBase(Platform):
has_sink
,
use_sparse
,
device_capability
,
attn_type
,
)
except
ImportError
:
invalid_reasons_i
=
[
"ImportError"
]
...
...
@@ -339,7 +341,13 @@ class CudaPlatformBase(Platform):
use_mla
:
bool
,
has_sink
:
bool
,
use_sparse
:
bool
,
attn_type
:
str
|
None
=
None
,
)
->
str
:
from
vllm.attention
import
AttentionType
if
attn_type
is
None
:
attn_type
=
AttentionType
.
DECODER
device_capability
=
cls
.
get_device_capability
()
assert
device_capability
is
not
None
...
...
@@ -356,6 +364,7 @@ class CudaPlatformBase(Platform):
has_sink
,
use_sparse
,
device_capability
,
attn_type
,
)
except
ImportError
:
invalid_reasons
=
[
"ImportError"
]
...
...
@@ -379,6 +388,7 @@ class CudaPlatformBase(Platform):
has_sink
,
use_sparse
,
device_capability
,
attn_type
,
)
reasons_str
=
(
"{"
...
...
vllm/platforms/interface.py
View file @
07a606aa
...
...
@@ -222,6 +222,7 @@ class Platform:
use_mla
:
bool
,
has_sink
:
bool
,
use_sparse
:
bool
,
attn_type
:
str
|
None
=
None
,
)
->
str
:
"""Get the attention backend class of a device."""
return
""
...
...
vllm/platforms/rocm.py
View file @
07a606aa
...
...
@@ -216,6 +216,7 @@ class RocmPlatform(Platform):
use_mla
,
has_sink
,
use_sparse
,
attn_type
:
str
|
None
=
None
,
)
->
str
:
from
vllm._aiter_ops
import
rocm_aiter_ops
from
vllm.attention.backends.registry
import
AttentionBackendEnum
...
...
vllm/platforms/tpu.py
View file @
07a606aa
...
...
@@ -61,6 +61,7 @@ class TpuPlatform(Platform):
use_mla
:
bool
,
has_sink
,
use_sparse
,
attn_type
:
str
|
None
=
None
,
)
->
str
:
from
vllm.attention.backends.registry
import
AttentionBackendEnum
...
...
vllm/platforms/xpu.py
View file @
07a606aa
...
...
@@ -51,6 +51,7 @@ class XPUPlatform(Platform):
use_mla
:
bool
,
has_sink
:
bool
,
use_sparse
,
attn_type
:
str
|
None
=
None
,
)
->
str
:
from
vllm.v1.attention.backends.utils
import
set_kv_cache_layout
...
...
vllm/v1/attention/backends/cpu_attn.py
View file @
07a606aa
...
...
@@ -48,6 +48,17 @@ class CPUAttentionBackend(AttentionBackend):
def
get_name
()
->
str
:
return
"CPU_ATTN"
@
classmethod
def
supports_attn_type
(
cls
,
attn_type
:
str
)
->
bool
:
"""CPU attention supports decoder and encoder-only attention."""
from
vllm.attention
import
AttentionType
return
attn_type
in
(
AttentionType
.
DECODER
,
AttentionType
.
ENCODER
,
AttentionType
.
ENCODER_ONLY
,
)
@
staticmethod
def
get_impl_cls
()
->
type
[
"CPUAttentionBackendImpl"
]:
return
CPUAttentionBackendImpl
...
...
vllm/v1/attention/backends/flash_attn.py
View file @
07a606aa
...
...
@@ -66,6 +66,18 @@ class FlashAttentionBackend(AttentionBackend):
def
get_name
()
->
str
:
return
"FLASH_ATTN"
@
classmethod
def
supports_attn_type
(
cls
,
attn_type
:
str
)
->
bool
:
"""FlashAttention supports all attention types."""
from
vllm.attention
import
AttentionType
return
attn_type
in
(
AttentionType
.
DECODER
,
AttentionType
.
ENCODER
,
AttentionType
.
ENCODER_ONLY
,
AttentionType
.
ENCODER_DECODER
,
)
@
staticmethod
def
get_impl_cls
()
->
type
[
"FlashAttentionImpl"
]:
return
FlashAttentionImpl
...
...
vllm/v1/attention/backends/flex_attention.py
View file @
07a606aa
...
...
@@ -84,6 +84,13 @@ class FlexAttentionBackend(AttentionBackend):
def
get_name
()
->
str
:
return
"FLEX_ATTENTION"
@
classmethod
def
supports_attn_type
(
cls
,
attn_type
:
str
)
->
bool
:
"""FlexAttention supports both decoder and encoder-only attention."""
from
vllm.attention
import
AttentionType
return
attn_type
in
(
AttentionType
.
DECODER
,
AttentionType
.
ENCODER_ONLY
)
@
staticmethod
def
get_impl_cls
()
->
type
[
"FlexAttentionImpl"
]:
return
FlexAttentionImpl
...
...
vllm/v1/attention/backends/mla/flashmla_sparse.py
View file @
07a606aa
...
...
@@ -40,14 +40,14 @@ logger = init_logger(__name__)
"""
NOTE: FlashMLA Sparse uses an fp8 cache with the following format
In the "FP8 with scale" format, each token's KV cache is 656 Bytes,
In the "FP8 with scale" format, each token's KV cache is 656 Bytes,
structured as:
- **First 512 bytes:** The "quantized NoPE" part, containing 512
- **First 512 bytes:** The "quantized NoPE" part, containing 512
`float8_e4m3` values.
- **Next 16 bytes:** Scale factors, containing 4 `float32` values.
The first `float32` is the scale for the first 128 `float8_e4m3` values,
- **Next 16 bytes:** Scale factors, containing 4 `float32` values.
The first `float32` is the scale for the first 128 `float8_e4m3` values,
the second for the next 128, and so on.
- **Last 128 bytes:** The "RoPE" part, containing 64 `bfloat16` values. This
- **Last 128 bytes:** The "RoPE" part, containing 64 `bfloat16` values. This
part is not quantized for accuracy.
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment