"tests/vscode:/vscode.git/clone" did not exist on "3a2cb2649d15021f48901acbddb872671478a1f2"
Unverified commit a57c877f, authored by Frank Wang and committed by GitHub
Browse files

[BugFix] Fallback from FA4->FA2 for Batch Invariance (#36059)


Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
parent f9170209
......@@ -4,6 +4,7 @@
from typing import Any
from vllm.logger import init_logger
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm.platforms import current_platform
logger = init_logger(__name__)
......@@ -111,6 +112,16 @@ def get_flash_attn_version(
)
fa_version = 2
# FA4 currently uses batch-shape-dependent scheduling
# heuristics on SM100+, which breaks batch invariance.
if vllm_is_batch_invariant() and fa_version == 4:
logger.warning_once(
"Cannot use FA version 4 with batch invariance, "
"defaulting to FA version 2.",
scope="local",
)
fa_version = 2
# FA4 on SM100 (Blackwell) has TMEM capacity limits that restrict
# supported head dimensions.
# See: https://github.com/Dao-AILab/flash-attention/issues/1959
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment