Commit 80a682c7 authored by zhuwenwen
Browse files

Set block_size to 64 only when both VLLM_USE_FLASH_ATTN_PA and VLLM_FLASH_ATTN_V1 are enabled; otherwise keep the default of 16

parent 8e1c204b
......@@ -75,7 +75,7 @@ class Attention(nn.Module):
calculate_kv_scales = cache_config.calculate_kv_scales
else:
kv_cache_dtype = "auto"
block_size = 16 if not envs.VLLM_USE_FLASH_ATTN_PA else 64
block_size = 16 if not envs.VLLM_USE_FLASH_ATTN_PA or not envs.VLLM_FLASH_ATTN_V1 else 64
is_attention_free = False
calculate_kv_scales = False
if num_kv_heads is None:
......@@ -303,7 +303,7 @@ class MultiHeadAttention(nn.Module):
attn_backend = get_attn_backend(head_size,
dtype,
kv_cache_dtype=None,
block_size=16 if not envs.VLLM_USE_FLASH_ATTN_PA else 64,
block_size=16 if not envs.VLLM_USE_FLASH_ATTN_PA or not envs.VLLM_FLASH_ATTN_V1 else 64,
is_attention_free=False)
backend = backend_name_to_enum(attn_backend.get_name())
if current_platform.is_rocm():
......
......@@ -1497,7 +1497,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
class CacheConfig:
"""Configuration for the KV cache."""
block_size: BlockSize = 16 if not envs.VLLM_USE_FLASH_ATTN_PA else 64 # type: ignore
block_size: BlockSize = 16 if not envs.VLLM_USE_FLASH_ATTN_PA or not envs.VLLM_FLASH_ATTN_V1 else 64 # type: ignore
"""Size of a contiguous cache block in number of tokens. This is ignored on
neuron devices and set to `--max-model-len`. On CUDA devices, only block
sizes up to 32 are supported. On HPU devices, block size defaults to 128.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment