Unverified commit f1fc2107, authored by Michael Goin, committed by GitHub
Browse files

[Bugfix] Disable cascade attention with FlashInfer (#26130)


Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent 13cdc021
......@@ -29,7 +29,6 @@ from vllm.utils.flashinfer import (can_use_trtllm_attention,
flashinfer_disable_q_quantization,
supports_trtllm_attention,
use_trtllm_attention)
-from vllm.v1.attention.backends.flash_attn import use_cascade_attention
# yapf conflicts with isort for this block
# yapf: disable
from vllm.v1.attention.backends.utils import (AttentionCGSupport,
......@@ -677,7 +676,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
# TODO: The cascade wrapper currently does not support setting
# kv cache dtype to something different from query dtype.
return False
-return use_cascade_attention(*args, **kwargs)
+# TODO: Cascade attention doesn't work, disable it for now
+# return use_cascade_attention(*args, **kwargs)
+return False
class FlashInferImpl(AttentionImpl):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment