Unverified commit 084a9dae, authored by Isotr0py, committed by GitHub
Browse files

[Bugfix] Disable FlexAttention direct block mask building for encoder-only models (#27344)


Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent c9461e05
...@@ -658,7 +658,10 @@ class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadat ...@@ -658,7 +658,10 @@ class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadat
total_cache_tokens=total_cache_tokens, total_cache_tokens=total_cache_tokens,
decode_offset=offset_tensor, decode_offset=offset_tensor,
num_blocks_per_seq=num_blocks_per_seq, num_blocks_per_seq=num_blocks_per_seq,
direct_build=self.direct_build, # FIXME(Isotr0py): direct build has issue to build bidirectional
# attention block mask for encoder-only models, disable it temporarily.
# see: https://github.com/vllm-project/vllm/pull/27329#issuecomment-3431484053
direct_build=(self.direct_build and common_attn_metadata.causal),
q_block_size=self.q_block_size, q_block_size=self.q_block_size,
kv_block_size=self.kv_block_size, kv_block_size=self.kv_block_size,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment