[Bugfix]: Fix cross attention backend selection for Turing GPU (#31806)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>

[Bugfix]: Fix cross attention backend selection for Turing GPU (#31806)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
02809af1 · Isotr0py · GitHub · cbd4690a · 02809af1
Unverified commit 02809af1, authored Jan 06, 2026 by Isotr0py; committed by GitHub on Jan 06, 2026
Show whitespace changes
Inline Side-by-side

Showing with 9 additions and 5 deletions

vllm/attention/layers/cross_attention.py +9 -5

No files found.
--- a/vllm/attention/layers/cross_attention.py
+++ b/vllm/attention/layers/cross_attention.py
@@ -149,16 +149,20 @@ class CrossAttention(Attention):
            kv_cache_dtype = "auto"
            block_size = 16

-        underlying_attn_backend = get_attn_backend(
-            head_size, dtype, kv_cache_dtype, block_size
-        )
-        attn_backend = create_cross_attention_backend(underlying_attn_backend)
-
        if attn_type is not None:
            assert attn_type == AttentionType.ENCODER_DECODER, (
                "CrossAttention only supports AttentionType.ENCODER_DECODER"
            )

+        underlying_attn_backend = get_attn_backend(
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            attn_type=AttentionType.ENCODER_DECODER,
+        )
+        attn_backend = create_cross_attention_backend(underlying_attn_backend)
+
        super().__init__(
            num_heads=num_heads,
            head_size=head_size,