[Bugfix] Remove redundant T.fill to fix precision issue (#667)

98f93db1 · 徐畅 · GitHub · 722c2a8c · 98f93db1
Commit 98f93db1 (unverified), authored Jul 26, 2025 by 徐畅; committed by GitHub on Jul 26, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 1 addition and 1 deletion

examples/flash_decoding/example_gqa_decode.py +1 -1

No files found.
--- a/examples/flash_decoding/example_gqa_decode.py
+++ b/examples/flash_decoding/example_gqa_decode.py
@@ -169,7 +169,7 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split,
            T.fill(scores_max, -T.infinity(accum_dtype))
            loop_range = T.ceildiv((seqlen_kv // num_split), block_N)
-            T.fill(K_shared, 0)
            for k in T.Pipelined(loop_range, num_stages=num_stages):
                T.copy(
                    K[bid, (seqlen_kv // num_split) * sid +