Set default block_size to 16 (previously None) when VLLM_USE_FLASH_ATTN_PA is not set

d8a060dd · zhuwenwen · 66092265 · d8a060dd
Commit d8a060dd authored Aug 01, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 1 addition and 1 deletion

vllm/config.py vllm/config.py +1 -1

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1630,7 +1630,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256", "sha256_cbor_64bit"]
 class CacheConfig:
    """Configuration for the KV cache."""

-    block_size: BlockSize = None if not envs.VLLM_USE_FLASH_ATTN_PA else 64  # type: ignore
+    block_size: BlockSize = 16 if not envs.VLLM_USE_FLASH_ATTN_PA else 64  # type: ignore
    """Size of a contiguous cache block in number of tokens. This is ignored on
    neuron devices and set to `--max-model-len`. On CUDA devices, only block
    sizes up to 32 are supported. On HPU devices, block size defaults to 128.