"vscode:/vscode.git/clone" did not exist on "59d7ffc17f4c948b4d25d014e4f90d0cd4c20990"
Unverified Commit 1f3a2c29 authored by Nicolò Lucchesi's avatar Nicolò Lucchesi Committed by GitHub
Browse files

[Bugfix] Disable CG for Whisper+FA2 (#33164)


Signed-off-by: default avatarNickLucche <nlucches@redhat.com>
parent 7227d061
......@@ -257,6 +257,26 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
)
supports_update_block_table: bool = True
@classmethod
def get_cudagraph_support(
cls,
vllm_config: "VllmConfig",
kv_cache_spec: "AttentionSpec",
) -> AttentionCGSupport:
# FA2 does not support CUDA graphs with encoder-decoder models due to
# accuracy issues reported in https://github.com/vllm-project/vllm/issues/33091
if (
vllm_config.model_config.is_encoder_decoder
and get_flash_attn_version() == 2
):
logger.warning_once(
"FlashAttention2 does not support CUDA graphs with "
"encoder-decoder models due to accuracy issues reported in #33091. "
"Disabling CUDA graph."
)
return AttentionCGSupport.NEVER
return cls._cudagraph_support
def __init__(
self,
kv_cache_spec: AttentionSpec,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment