Unverified commit c765f0b4, authored by Chen Zhang, committed by GitHub
Browse files

[FlashInfer] Avoid FlashInfer block_size 16 + head_size 256 on blackwell (#27994)


Signed-off-by: Chen Zhang <zhangch99@outlook.com>
parent 002b07c4
...@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING ...@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING
import vllm.envs as envs import vllm.envs as envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models import ModelRegistry
from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv, round_up from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
...@@ -356,6 +357,17 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): ...@@ -356,6 +357,17 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
).page_size_bytes ).page_size_bytes
else: else:
kernel_block_alignment_size = 16 kernel_block_alignment_size = 16
if (
current_platform.is_device_capability(100)
and model_config.get_head_size() == 256
and (
envs.VLLM_ATTENTION_BACKEND is None
or envs.VLLM_ATTENTION_BACKEND == "FLASHINFER"
)
):
# https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that
# head size 256 and block size 16 is not supported on blackwell.
kernel_block_alignment_size = 32
attn_page_size_1_token = FullAttentionSpec( attn_page_size_1_token = FullAttentionSpec(
block_size=1, block_size=1,
num_kv_heads=model_config.get_num_kv_heads(parallel_config), num_kv_heads=model_config.get_num_kv_heads(parallel_config),
......
...@@ -402,6 +402,15 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): ...@@ -402,6 +402,15 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
) )
self.paged_kv_last_page_len_np = self.paged_kv_last_page_len_cpu.numpy() self.paged_kv_last_page_len_np = self.paged_kv_last_page_len_cpu.numpy()
if self.head_dim == 256 and current_platform.is_device_capability(100):
# https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that
# head size 256 and block size 16 is not supported on blackwell.
assert kv_cache_spec.block_size != 16, (
"There is a bug in FlashInfer "
"block_size 16 head size 256 support. Please avoid this combination by "
"passing --block-size 32 or --block-size 64."
)
def _get_workspace_buffer(self): def _get_workspace_buffer(self):
if self._workspace_buffer is None: if self._workspace_buffer is None:
buffer_size = FLASHINFER_WORKSPACE_BUFFER_SIZE buffer_size = FLASHINFER_WORKSPACE_BUFFER_SIZE
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment