"include/vscode:/vscode.git/clone" did not exist on "f10bfbf9b03026c9756a16839fd18d7fff9e053f"
Unverified commit 3ab93cd7, authored by Tong WU, committed by GitHub

[Enhancement] Keep the max attention score across blocks in FlashAttention for better numerical stability (#1269)

* Implement max score retention across blocks in FlashAttention for improved stability

* fix manual pipeline parameters

* Update examples/flash_attention/example_gqa_fwd_varlen.py
  Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

* fix typo

* more

* fix a previous typo

---------
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent b3d6f03c
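For context on why the added clamp matters: FlashAttention's online softmax keeps a running row maximum and rescales the partial sums whenever a new block is processed. If a block's local maximum is allowed to replace, rather than be combined with, the previous running maximum, a block with smaller scores can make the maximum drop and the rescaling exponent turn positive, which is the overflow risk the kernels now avoid with `T.max(scores_max[i], scores_max_prev[i])`. The NumPy sketch below is illustrative only (natural `exp` instead of the kernels' scaled `exp2`, and not code from this commit); it shows the recurrence the change restores:

```python
import numpy as np

def online_softmax_stats(score_blocks):
    """Streaming softmax statistics per query row, processed block by block.

    m is the running row maximum, l the running sum of exp(score - m).
    Clamping the new maximum against the previous one (np.maximum below)
    keeps every exponent argument <= 0, mirroring the added loop
    scores_max[i] = T.max(scores_max[i], scores_max_prev[i]).
    """
    rows = score_blocks[0].shape[0]
    m = np.full(rows, -np.inf)      # running max per row
    l = np.zeros(rows)              # running normalizer per row
    for s in score_blocks:          # s has shape (rows, block_N)
        m_prev = m
        m = np.maximum(s.max(axis=1), m_prev)   # never let the max decrease
        scale = np.exp(m_prev - m)              # rescale the previous partial sum
        l = l * scale + np.exp(s - m[:, None]).sum(axis=1)
    return m, l

# Sanity check against a single-pass softmax normalizer.
blocks = [np.array([[1.0, 3.0], [0.5, 0.2]]), np.array([[2.0, -1.0], [4.0, 0.0]])]
m, l = online_softmax_stats(blocks)
full = np.concatenate(blocks, axis=1)
assert np.allclose(l * np.exp(m), np.exp(full).sum(axis=1))
```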
@@ -61,6 +61,8 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc
T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared)
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
for i in T.Parallel(block_M):
    scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, dim_v):
...
@@ -66,6 +66,8 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc
T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared)
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
for i in T.Parallel(block_M):
    scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, dim_v):
...
@@ -119,6 +119,8 @@ def flashattn_fwd(batch,
V_shared[i, d] = 0.0
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
for i in T.Parallel(block_M):
    scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, dim_v):
...
@@ -61,6 +61,8 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc
T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared)
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
for i in T.Parallel(block_M):
    scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, dim_v):
...
@@ -127,6 +127,8 @@ def flashattn(batch,
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
# To do causal softmax, we need to set the scores_max to 0 if it is -inf
# This process is called Check_inf in FlashAttention3 code, and it only need to be done
# in the first ceil_div(kBlockM, kBlockN) steps.
...
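The Check_inf comment in the hunk above refers to the case where, under causal masking, an entire query row of the current KV block is masked out: its block maximum stays at -inf, and the rescaling factor exp(prev_max - new_max) could otherwise evaluate to exp(-inf - (-inf)) = NaN. A minimal sketch of that guard (hypothetical helper name, natural exp, not code from this repository):

```python
import numpy as np

def check_inf(scores_max):
    # Fully masked rows have a -inf maximum; clamp them to 0 so that the
    # subsequent rescaling factor exp(prev_max - new_max) stays finite.
    return np.where(np.isneginf(scores_max), 0.0, scores_max)

row_max = np.array([-np.inf, 2.5])   # first row fully masked by the causal mask
print(check_inf(row_max))            # [0.  2.5]
```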
@@ -94,6 +94,8 @@ def flashattn(
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
# To do causal softmax, we need to set the scores_max to 0 if it is -inf
# This process is called Check_inf in FlashAttention3 code, and it only need to be done
# in the first ceil_div(kBlockM, kBlockN) steps.
@@ -154,7 +156,7 @@ def flashattn(
num_stages=num_stages,
order=[-1, 0, 3, 1, -1, 2],
stage=[-1, 0, 0, 1, -1, 1],
- group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]):
+ group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]):
MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz)
Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum,
        logsum)
...
@@ -155,7 +155,6 @@ def flashattn(batch_size,
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
for i in T.Parallel(block_M):
    scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
...
@@ -63,6 +63,8 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N):
T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared)
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
for i in T.Parallel(block_M):
    scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, dim):
...
@@ -59,6 +59,8 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N):
T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared)
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
for i in T.Parallel(block_M):
    scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, dim):
@@ -344,7 +346,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--batch', type=int, default=8, help='Batch size')
parser.add_argument('--h', type=int, default=32, help='Number of heads')
- parser.add_argument('--n_ctx', type=int, default=1048, help='Context size')
+ parser.add_argument('--n_ctx', type=int, default=1024, help='Context size')
parser.add_argument('--d_head', type=int, default=64, help='Head dimension')
parser.add_argument('--causal', type=bool, default=False, help='Causal flag')
args = parser.parse_args()
...
@@ -60,6 +60,8 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N):
T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared)
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
for i in T.Parallel(block_M):
    scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, dim):
...
@@ -86,6 +86,8 @@ def flashattn(batch,
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
# To do causal softmax, we need to set the scores_max to 0 if it is -inf
# This process is called Check_inf in FlashAttention3 code, and it only need to be done
# in the first ceil_div(kBlockM, kBlockN) steps.
@@ -149,7 +151,7 @@ def flashattn(batch,
num_stages=num_stages,
order=[-1, 0, 3, 1, -1, 2],
stage=[-1, 0, 0, 1, -1, 1],
- group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]):
+ group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]):
MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz)
Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum,
        logsum)
...
@@ -81,6 +81,8 @@ def flashattn(batch,
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
# To do causal softmax, we need to set the scores_max to 0 if it is -inf
# This process is called Check_inf in FlashAttention3 code, and it only need to be done
# in the first ceil_div(kBlockM, kBlockN) steps.
...
@@ -81,6 +81,8 @@ def flashattn(batch,
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
# To do causal softmax, we need to set the scores_max to 0 if it is -inf
# This process is called Check_inf in FlashAttention3 code, and it only need to be done
# in the first ceil_div(kBlockM, kBlockN) steps.
@@ -141,7 +143,7 @@ def flashattn(batch,
num_stages=num_stages,
order=[-1, 0, 3, 1, -1, 2],
stage=[-1, 0, 0, 1, -1, 1],
- group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]):
+ group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]):
MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz)
Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum,
        logsum)
...
@@ -167,6 +167,8 @@ def flashattn(batch_size,
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
# To do causal softmax, we need to set the scores_max to 0 if it is -inf
# This process is called Check_inf in FlashAttention3 code, and it only need to be done
# in the first ceil_div(kBlockM, kBlockN) steps.
...
@@ -115,6 +115,8 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split,
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_H):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
for i in T.Parallel(block_H):
    scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_H, block_N):
@@ -188,6 +190,8 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split,
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_H):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
for i in T.Parallel(block_H):
    scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_H, block_N):
...
@@ -70,6 +70,8 @@ def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
# To do causal softmax, we need to set the scores_max to 0 if it is -inf
# This process is called Check_inf in FlashAttention3 code, and it only need to be done
# in the first ceil_div(kBlockM, kBlockN) steps.
...
@@ -87,6 +87,8 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
for i in T.Parallel(block_M):
    scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
@@ -194,6 +196,8 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
for i in T.Parallel(block_M):
    scores_scale[i] = T.exp2(scores_max_prev[i] * scale -
...
@@ -62,6 +62,8 @@ def blocksparse_flashattn(batch, heads, seq_q, seq_kv, dim, downsample_len, is_c
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
# To do causal softmax, we need to set the scores_max to 0 if it is -inf
# This process is called Check_inf in FlashAttention3 code, and it only need to be done
# in the first ceil_div(kBlockM, kBlockN) steps.
...
@@ -71,6 +71,8 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal)
T.copy(scores_max, scores_max_prev)
T.fill(scores_max, -T.infinity(accum_dtype))
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
+ for i in T.Parallel(block_M):
+     scores_max[i] = T.max(scores_max[i], scores_max_prev[i])
# To do causal softmax, we need to set the scores_max to 0 if it is -inf
# This process is called Check_inf in FlashAttention3 code, and it only need to be done
# in the first ceil_div(kBlockM, kBlockN) steps.
...