Fix EVEN_M & EVEN_HEADDIM for headdim=40 in Triton bwd

215930bc · Tri Dao · 4f81aff4 · 215930bc
Commit 215930bc authored Oct 31, 2022 by Tri Dao
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 5 deletions

flash_attn/flash_attn_triton.py +3 -5

No files found.
--- a/flash_attn/flash_attn_triton.py
+++ b/flash_attn/flash_attn_triton.py
@@ -257,11 +257,9 @@ def _bwd_kernel_one_col_block(
        start_m = tl.multiple_of(start_m, BLOCK_M)
        offs_m_curr = start_m + offs_m
        # load q, k, v, do on-chip
-        if EVEN_M:
+        # Same bug as below. Otherwise gives wrong result for headdim=40, seqlen=(128, 117)
-            if EVEN_HEADDIM:
+        if EVEN_M & EVEN_HEADDIM:
            q = tl.load(q_ptrs)
-            else:
-                q = tl.load(q_ptrs, mask=(offs_d[None, :] < headdim))
        else:
            if EVEN_HEADDIM:
                q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)