Set block_size to 64 only when both VLLM_USE_FLASH_ATTN_PA and VLLM_FLASH_ATTN_V1 are enabled; otherwise use 16

80a682c7 · zhuwenwen · 8e1c204b · 80a682c7 · 80a682c7
Commit 80a682c7 authored Aug 05, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 3 deletions

vllm/attention/layer.py vllm/attention/layer.py +2 -2

vllm/config.py vllm/config.py +1 -1

No files found.
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -75,7 +75,7 @@ class Attention(nn.Module):
            calculate_kv_scales = cache_config.calculate_kv_scales
        else:
            kv_cache_dtype = "auto"
-            block_size = 16 if not envs.VLLM_USE_FLASH_ATTN_PA else 64
+            block_size = 16 if not envs.VLLM_USE_FLASH_ATTN_PA or not envs.VLLM_FLASH_ATTN_V1 else 64
            is_attention_free = False
            calculate_kv_scales = False
        if num_kv_heads is None:
@@ -303,7 +303,7 @@ class MultiHeadAttention(nn.Module):
        attn_backend = get_attn_backend(head_size,
                                        dtype,
                                        kv_cache_dtype=None,
-                                        block_size=16 if not envs.VLLM_USE_FLASH_ATTN_PA else 64,
+                                        block_size=16 if not envs.VLLM_USE_FLASH_ATTN_PA or not envs.VLLM_FLASH_ATTN_V1 else 64,
                                        is_attention_free=False)
        backend = backend_name_to_enum(attn_backend.get_name())
        if current_platform.is_rocm():

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1497,7 +1497,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
 class CacheConfig:
    """Configuration for the KV cache."""
-    block_size: BlockSize = 16 if not envs.VLLM_USE_FLASH_ATTN_PA else 64  # type: ignore
+    block_size: BlockSize = 16 if not envs.VLLM_USE_FLASH_ATTN_PA or not envs.VLLM_FLASH_ATTN_V1 else 64  # type: ignore
    """Size of a contiguous cache block in number of tokens. This is ignored on
    neuron devices and set to `--max-model-len`. On CUDA devices, only block
    sizes up to 32 are supported. On HPU devices, block size defaults to 128.