From eac96cd7a2741bf0fb343d2e857487b1832fc4ec Mon Sep 17 00:00:00 2001 From: Zhengju Tang <97930865+tzj-fxz@users.noreply.github.com> Date: Fri, 14 Nov 2025 20:51:16 +0800 Subject: [PATCH 001/139] [BugFix] Add autotune and exp2 for GDN kernel (#1258) * [BugFix] Add autotune and exp2 for GDN kernel * [Lint] * [Lint] --- examples/gdn/example_chunk_delta_h.py | 54 ++++++++++++++++++--------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/examples/gdn/example_chunk_delta_h.py b/examples/gdn/example_chunk_delta_h.py index 4d6b657f..61c2abd3 100644 --- a/examples/gdn/example_chunk_delta_h.py +++ b/examples/gdn/example_chunk_delta_h.py @@ -3,6 +3,7 @@ import sys # noqa: F401 import tilelang import tilelang.language as T +from tilelang.autotuner import autotune # Add your fla repository path to sys.path # Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae @@ -80,7 +81,25 @@ def prepare_output( return h, final_state, V_new -@tilelang.jit(out_idx=[-3, -2, -1]) +def get_configs(): + import itertools + block_DK = [32, 64, 128] + block_DV = [32, 64, 128] + threads = [128, 256] + num_stages = [1, 2, 3] + _configs = list(itertools.product(block_DK, block_DV, threads, num_stages)) + + configs = [{ + 'block_DK': c[0], + 'block_DV': c[1], + 'threads': c[2], + 'num_stages': c[3] + } for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=3, rep=5) +@tilelang.jit(out_idx=[-3, -2, -1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) def tilelang_chunk_gated_delta_rule_fwd_h( # task config B, @@ -94,15 +113,15 @@ def tilelang_chunk_gated_delta_rule_fwd_h( gate_dtype, state_dtype, chunk_size, - use_g=True, - use_initial_state=True, - store_final_state=True, - save_new_value=True, + use_g, + use_initial_state, + store_final_state, + save_new_value, # kernel config block_DK=64, - block_DV=64, - threads=256, - num_stages=0, + block_DV=32, + threads=128, + num_stages=1, ): block_S = 
chunk_size BS = S // block_S @@ -193,11 +212,11 @@ def tilelang_chunk_gated_delta_rule_fwd_h( for i_s2, i_v in T.Parallel(block_S, block_DV): with T.If(G_last_local[0] - G_fragment[i_s2, i_v] <= 0): with T.Then(): - V_new_fragment[i_s2, i_v] = V_new_fragment[i_s2, i_v] * T.exp( - G_last_local[0] - G_fragment[i_s2, i_v]) + V_new_fragment[i_s2, i_v] = V_new_fragment[i_s2, i_v] * T.exp2( + (G_last_local[0] - G_fragment[i_s2, i_v]) * 1.442695) with T.Else(): V_new_fragment[i_s2, i_v] = 0 - G_last_local[0] = T.exp(G_last_local[0]) + G_last_local[0] = T.exp2(G_last_local[0] * 1.442695) for i_k, i_v in T.Parallel(DK, block_DV): b_h_fragment[i_k, i_v] *= G_last_local[0] @@ -281,8 +300,7 @@ def run_test( kernel = tilelang_chunk_gated_delta_rule_fwd_h(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, gate_dtype, state_dtype, chunk_size, use_g, use_initial_state, store_final_state, - save_new_value, block_DK, block_DV, threads, - num_stages) + save_new_value) h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, initial_state) # (zhengju) If you want to print the generated cuda code, you can uncomment the following line # print("CUDA Code:\n", kernel.get_kernel_source()) @@ -352,13 +370,13 @@ def main(): state_dtype="float32", chunk_size=64, use_g=True, - use_initial_state=True, - store_final_state=True, - save_new_value=True, - block_DK=64, + use_initial_state=False, + store_final_state=False, + save_new_value=False, + block_DK=32, block_DV=32, threads=128, - num_stages=1, + num_stages=2, ) -- GitLab From 0af3fd7c70711f1c78da9bc087293826ecba451e Mon Sep 17 00:00:00 2001 From: Tong WU <109033598+Rachmanino@users.noreply.github.com> Date: Sat, 15 Nov 2025 09:36:16 +0800 Subject: [PATCH 002/139] [BugFix] Refactor attention kernel to handle OOB positions by filling with `-inf` instead of clearing accumulators. (#1222) * Refactor attention kernel to handle OOB positions by filling with `-inf` instead of clearing accumulators. 
* lint * pre-commit * Update imports in flash attention test file to use new backward and forward examples for better clarity and consistency. --- examples/flash_attention/example_gqa_bwd.py | 4 +++- examples/flash_attention/example_gqa_bwd_tma_reduce.py | 4 +++- .../flash_attention/example_gqa_bwd_wgmma_pipelined.py | 4 +++- examples/flash_attention/example_gqa_fwd_bshd.py | 4 +++- .../example_gqa_fwd_bshd_wgmma_pipelined.py | 4 +++- examples/flash_attention/example_mha_bwd_bhsd.py | 6 +++++- .../{example_mha_bwd.py => example_mha_bwd_bshd.py} | 8 ++++++-- ...pelined.py => example_mha_bwd_bshd_wgmma_pipelined.py} | 6 +++++- examples/flash_attention/example_mha_fwd_bhsd.py | 6 ++++-- .../example_mha_fwd_bhsd_wgmma_pipelined.py | 4 +++- examples/flash_attention/example_mha_fwd_bshd.py | 5 ++++- .../example_mha_fwd_bshd_wgmma_pipelined.py | 5 ++++- examples/flash_attention/test_example_flash_attention.py | 8 ++++---- 13 files changed, 50 insertions(+), 18 deletions(-) rename examples/flash_attention/{example_mha_bwd.py => example_mha_bwd_bshd.py} (97%) rename examples/flash_attention/{example_mha_bwd_wgmma_pipelined.py => example_mha_bwd_bshd_wgmma_pipelined.py} (97%) diff --git a/examples/flash_attention/example_gqa_bwd.py b/examples/flash_attention/example_gqa_bwd.py index 907a121d..dd9c8f7c 100644 --- a/examples/flash_attention/example_gqa_bwd.py +++ b/examples/flash_attention/example_gqa_bwd.py @@ -54,7 +54,9 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, + -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) diff --git 
a/examples/flash_attention/example_gqa_bwd_tma_reduce.py b/examples/flash_attention/example_gqa_bwd_tma_reduce.py index 615c2e19..2af06e4b 100644 --- a/examples/flash_attention/example_gqa_bwd_tma_reduce.py +++ b/examples/flash_attention/example_gqa_bwd_tma_reduce.py @@ -59,7 +59,9 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, T.Cast(accum_dtype, -1e30)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, + -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) diff --git a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py index ed07e7d9..02421249 100644 --- a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py +++ b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py @@ -54,7 +54,9 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, + -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) diff --git a/examples/flash_attention/example_gqa_fwd_bshd.py b/examples/flash_attention/example_gqa_fwd_bshd.py index 4d9d06a4..3d4bfe45 100644 --- a/examples/flash_attention/example_gqa_fwd_bshd.py +++ b/examples/flash_attention/example_gqa_fwd_bshd.py @@ -96,7 +96,9 @@ def flashattn(batch, acc_s[i, j] = T.if_then_else(bx * block_M + i >= 
k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), + 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro diff --git a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py index 1c1fc12d..21f5e9a9 100644 --- a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py +++ b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py @@ -63,7 +63,9 @@ def flashattn( acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), + 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro diff --git a/examples/flash_attention/example_mha_bwd_bhsd.py b/examples/flash_attention/example_mha_bwd_bhsd.py index 1595ae76..8247b265 100644 --- a/examples/flash_attention/example_mha_bwd_bhsd.py +++ b/examples/flash_attention/example_mha_bwd_bhsd.py @@ -56,7 +56,9 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, + -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) @@ -213,6 +215,8 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + # 
We don't need to handle OOB positions for non-causal cases, + # since OOB values won't affect other positions here. T.copy(dO[bz, bx, k * block_N:(k + 1) * block_N, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) diff --git a/examples/flash_attention/example_mha_bwd.py b/examples/flash_attention/example_mha_bwd_bshd.py similarity index 97% rename from examples/flash_attention/example_mha_bwd.py rename to examples/flash_attention/example_mha_bwd_bshd.py index 543c2c0e..414061ff 100644 --- a/examples/flash_attention/example_mha_bwd.py +++ b/examples/flash_attention/example_mha_bwd_bshd.py @@ -52,7 +52,9 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, + -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) T.copy(scores_max, scores_max_prev) @@ -206,6 +208,8 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + # We don't need to handle OOB positions for non-causal cases, + # since OOB values won't affect other positions here. 
T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -340,7 +344,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--batch', type=int, default=8, help='Batch size') parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') + parser.add_argument('--n_ctx', type=int, default=1048, help='Context size') parser.add_argument('--d_head', type=int, default=64, help='Head dimension') parser.add_argument('--causal', type=bool, default=False, help='Causal flag') args = parser.parse_args() diff --git a/examples/flash_attention/example_mha_bwd_wgmma_pipelined.py b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py similarity index 97% rename from examples/flash_attention/example_mha_bwd_wgmma_pipelined.py rename to examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py index 7ad417ef..e10ef581 100644 --- a/examples/flash_attention/example_mha_bwd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py @@ -53,7 +53,9 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, + -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) T.copy(scores_max, scores_max_prev) @@ -193,6 +195,8 @@ def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + # We don't need to handle OOB positions for non-causal cases, + 
# since OOB values won't affect other positions here. T.wait_wgmma(0) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) diff --git a/examples/flash_attention/example_mha_fwd_bhsd.py b/examples/flash_attention/example_mha_fwd_bhsd.py index f07f7a61..e936cee3 100644 --- a/examples/flash_attention/example_mha_fwd_bhsd.py +++ b/examples/flash_attention/example_mha_fwd_bhsd.py @@ -55,7 +55,9 @@ def flashattn(batch, k_idx = k * block_N + j acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + # We shall fill -inf for OOB positions + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_kv, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro @@ -226,7 +228,7 @@ if __name__ == "__main__": parser.add_argument('--seq_q', type=int, default=256, help='query sequence length') parser.add_argument('--seq_kv', type=int, default=256, help='key/value sequence length') parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') + parser.add_argument('--is_causal', action='store_true', help='causal', default=False) parser.add_argument('--tune', action='store_true', help='tune configs') args = parser.parse_args() main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py index 26167b34..e1d0130a 100644 --- a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py @@ -55,7 +55,9 @@ def flashattn(batch, k_idx = k * block_N + j acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + # We shall fill -inf for OOB positions + 
for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_kv, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro diff --git a/examples/flash_attention/example_mha_fwd_bshd.py b/examples/flash_attention/example_mha_fwd_bshd.py index 6a1f707e..a9268019 100644 --- a/examples/flash_attention/example_mha_fwd_bshd.py +++ b/examples/flash_attention/example_mha_fwd_bshd.py @@ -49,7 +49,10 @@ def flashattn(batch, acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + # We shall fill -inf for OOB positions + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), + 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro diff --git a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py index 3928db4c..d7023a20 100644 --- a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py @@ -49,7 +49,10 @@ def flashattn(batch, acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + # We shall fill -inf for OOB positions + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), + 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @T.macro diff --git a/examples/flash_attention/test_example_flash_attention.py b/examples/flash_attention/test_example_flash_attention.py index f4932aee..b184fc60 100644 --- a/examples/flash_attention/test_example_flash_attention.py +++ b/examples/flash_attention/test_example_flash_attention.py @@ -2,7 +2,7 @@ import tilelang.testing import 
example_gqa_bwd import example_gqa_bwd_wgmma_pipelined -import example_mha_bwd +import example_mha_bwd_bshd import example_mha_bwd_bhsd import example_mha_fwd_bhsd_wgmma_pipelined import example_gqa_fwd_bshd @@ -10,7 +10,7 @@ import example_mha_fwd_bshd import example_gqa_fwd_bshd_wgmma_pipelined import example_mha_fwd_bshd_wgmma_pipelined import example_mha_fwd_varlen -import example_mha_bwd_wgmma_pipelined +import example_mha_bwd_bshd_wgmma_pipelined import example_mha_fwd_bhsd import example_gqa_bwd_tma_reduce_varlen @@ -33,7 +33,7 @@ def test_example_gqa_bwd_wgmma_pipelined(): @tilelang.testing.requires_cuda def test_example_mha_bwd(): - example_mha_bwd.main( + example_mha_bwd_bshd.main( BATCH=1, H=16, N_CTX=512, @@ -56,7 +56,7 @@ def test_example_mha_bwd_bhsd(): @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_mha_bwd_wgmma_pipelined(): - example_mha_bwd_wgmma_pipelined.main(BATCH=1, H=32, N_CTX=256, D_HEAD=64, causal=False) + example_mha_bwd_bshd_wgmma_pipelined.main(BATCH=1, H=32, N_CTX=256, D_HEAD=64, causal=False) @tilelang.testing.requires_cuda -- GitLab From eb41574431608e2a96d3d8941f9c1e6d775f228e Mon Sep 17 00:00:00 2001 From: Gabriel Wu <13583761+lucifer1004@users.noreply.github.com> Date: Sat, 15 Nov 2025 11:43:03 +0800 Subject: [PATCH 003/139] [fix] NVRTC execution backend (#1256) * [fix] NVRTC execution backend * [fmt] run pre-commit * [fix] coderabbit reviews * [test] add cuda-python to test dep * [fix] coderabbit reviews * [fix] CUDA 13 compatibility * [fix] sm90 * [fix] CUDA 13 compatibility * [fix] pre-commit * [fix] always use cuda::std::__atomic_ref_impl * [fix] restore to external API * Revert "[fix] restore to external API" This reverts commit 49bd875638fb631d270015f408991d38fd1e9a5d. 
* [fmt] use space instead tabs for py codegen * [fix] im2col API * [fix] revert atomic.h * [fix] dynamic shape * [refactor] extract common utils * [feat] support L2 persistent map * [fix] l2 persistent map * [fix] pre-commit * [fix] restore _TYPE_MAP * [fix] pre-commit * [fix] avoid duplicate TMA descs * [docs] add docstring * [fix] coderabbit * [fix] coderabbit * [fix] coderabbit * [fix] coderabbit --- requirements-test-cuda.txt | 1 + src/tl_templates/cuda/instruction/mma.h | 2 + src/tl_templates/cuda/instruction/mma_sm70.h | 2 + src/tl_templates/cuda/instruction/wgmma.h | 2 + src/tl_templates/cuda/nvrtc_std.h | 53 ++ src/tl_templates/cuda/reduce.h | 3 + testing/python/jit/test_tilelang_jit_nvrtc.py | 585 ++++++++++++++++++ tilelang/jit/adapter/libgen.py | 102 --- tilelang/jit/adapter/nvrtc/__init__.py | 25 +- tilelang/jit/adapter/nvrtc/adapter.py | 7 +- tilelang/jit/adapter/nvrtc/libgen.py | 235 +++++++ tilelang/jit/adapter/nvrtc/wrapper.py | 563 +++++++++++++++++ tilelang/jit/adapter/utils.py | 251 +++++++- tilelang/jit/adapter/wrapper.py | 432 +------------ tilelang/jit/kernel.py | 4 +- tilelang/language/annotations.py | 3 +- 16 files changed, 1747 insertions(+), 523 deletions(-) create mode 100644 testing/python/jit/test_tilelang_jit_nvrtc.py create mode 100644 tilelang/jit/adapter/nvrtc/libgen.py create mode 100644 tilelang/jit/adapter/nvrtc/wrapper.py diff --git a/requirements-test-cuda.txt b/requirements-test-cuda.txt index 5413ad51..12232023 100644 --- a/requirements-test-cuda.txt +++ b/requirements-test-cuda.txt @@ -6,3 +6,4 @@ # CUDA specific requirements flash-attn==2.5.8 +cuda-python==12.9.4 diff --git a/src/tl_templates/cuda/instruction/mma.h b/src/tl_templates/cuda/instruction/mma.h index ed561285..869fa777 100644 --- a/src/tl_templates/cuda/instruction/mma.h +++ b/src/tl_templates/cuda/instruction/mma.h @@ -4,8 +4,10 @@ #include #include +#ifndef __CUDACC_RTC__ #include #include +#endif namespace tl { diff --git 
a/src/tl_templates/cuda/instruction/mma_sm70.h b/src/tl_templates/cuda/instruction/mma_sm70.h index 65674175..7a44b921 100644 --- a/src/tl_templates/cuda/instruction/mma_sm70.h +++ b/src/tl_templates/cuda/instruction/mma_sm70.h @@ -2,8 +2,10 @@ #include "../common.h" +#ifndef __CUDACC_RTC__ #include #include +#endif namespace tl { diff --git a/src/tl_templates/cuda/instruction/wgmma.h b/src/tl_templates/cuda/instruction/wgmma.h index b5ef59c2..3af2d79f 100644 --- a/src/tl_templates/cuda/instruction/wgmma.h +++ b/src/tl_templates/cuda/instruction/wgmma.h @@ -4,8 +4,10 @@ #include #include +#ifndef __CUDACC_RTC__ #include #include +#endif namespace tl { diff --git a/src/tl_templates/cuda/nvrtc_std.h b/src/tl_templates/cuda/nvrtc_std.h index 9930c220..1e6800e5 100644 --- a/src/tl_templates/cuda/nvrtc_std.h +++ b/src/tl_templates/cuda/nvrtc_std.h @@ -19,6 +19,11 @@ #ifdef __CUDACC_RTC__ +// Disable problematic CUDA standard library headers in NVRTC environment +// Vector types (float4, uchar, etc.) 
are built-in to NVRTC and don't need these +// headers +#define _LIBCUDACXX___TUPLE_VECTOR_TYPES_H // Prevent vector_types.h inclusion + using int8_t = signed char; using uint8_t = unsigned char; using int16_t = signed short; @@ -67,6 +72,24 @@ template struct is_same : true_type {}; template inline constexpr bool is_same_v = is_same::value; +template struct is_void : false_type {}; + +template <> struct is_void : true_type {}; +template <> struct is_void : true_type {}; +template <> struct is_void : true_type {}; +template <> struct is_void : true_type {}; + +template inline constexpr bool is_void_v = is_void::value; + +template struct is_pointer : false_type {}; + +template struct is_pointer : true_type {}; +template struct is_pointer : true_type {}; +template struct is_pointer : true_type {}; +template struct is_pointer : true_type {}; + +template inline constexpr bool is_pointer_v = is_pointer::value; + namespace index_sequence_impl { // Based on https://stackoverflow.com/a/32223343/11717224 @@ -118,6 +141,36 @@ template struct enable_if {}; template struct enable_if { using type = T; }; + +template struct remove_extent { + using type = T; +}; + +template struct remove_extent { + using type = T; +}; + +template struct remove_extent { + using type = T; +}; + +template using remove_extent_t = typename remove_extent::type; + +template +struct extent : integral_constant {}; + +template struct extent : integral_constant {}; + +template struct extent : extent {}; + +template +struct extent : integral_constant {}; + +template +struct extent : extent {}; + +template +inline constexpr size_t extent_v = extent::value; } // namespace std #endif \ No newline at end of file diff --git a/src/tl_templates/cuda/reduce.h b/src/tl_templates/cuda/reduce.h index 0009b9b9..a083c711 100644 --- a/src/tl_templates/cuda/reduce.h +++ b/src/tl_templates/cuda/reduce.h @@ -1,8 +1,11 @@ #pragma once #include "common.h" + +#ifndef __CUDACC_RTC__ #include #include +#endif namespace tl { diff 
--git a/testing/python/jit/test_tilelang_jit_nvrtc.py b/testing/python/jit/test_tilelang_jit_nvrtc.py new file mode 100644 index 00000000..c7076861 --- /dev/null +++ b/testing/python/jit/test_tilelang_jit_nvrtc.py @@ -0,0 +1,585 @@ +from tilelang import tvm as tvm +import tilelang.language as T +import tilelang.testing +import tilelang +import torch +from tilelang.utils.tensor import map_torch_type + + +def matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm( + M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128, +): + program = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + stramp = "&*(XS)" + + 
@tvm.register_global_func("tilelang_callback_cuda_postproc", override=True) + def tilelang_callback_cuda_postproc(code, _): + code = f"// {stramp}\n" + code + return code + + matmul_kernel = tilelang.compile(program, out_idx=-1, execution_backend="nvrtc") + + kernel_source = matmul_kernel.get_kernel_source() + + assert stramp in kernel_source, f"Expected {stramp} in the kernel source" + + +def test_gemm_f16f16f16_nn(): + run_gemm( + 512, + 1024, + 768, + False, + False, + "float16", + "float16", + "float16", + 128, + 256, + 32, + 2, + ) + + +def matmu_jit_kernel( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm_jit_kernel( + M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128, +): + program = matmu_jit_kernel( + M, 
+ N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + matmul_kernel = tilelang.compile(program, out_idx=-1, execution_backend="nvrtc") + + in_dtype = map_torch_type(in_dtype) + out_dtype = map_torch_type(out_dtype) + + A = torch.randn(M, K, dtype=in_dtype).cuda() + B = torch.randn(K, N, dtype=in_dtype).cuda() + + if trans_A: + A = A.T + if trans_B: + B = B.T + + def ref_program(A, B): + import torch + C = torch.matmul(A.to(torch.float), B.to(torch.float)) + C = C.to(out_dtype) + return C + + ref_C = ref_program(A, B) + C = matmul_kernel(A, B) + + tilelang.testing.torch_assert_close(C, ref_C, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) + + +def test_gemm_jit_kernel(): + run_gemm_jit_kernel( + 512, + 1024, + 768, + False, + False, + "float16", + "float16", + "float16", + 128, + 256, + 32, + 2, + ) + + +def run_nvrtc_kernel_do_bench(M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128): + program = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + matmul_kernel = tilelang.compile(program, execution_backend="nvrtc") + + profiler = matmul_kernel.get_profiler() + + nvrtc_latency = profiler.do_bench(func=matmul_kernel) + print(f"NVRTC Latency: {nvrtc_latency} ms") + + assert nvrtc_latency is not None + + tvm_latency = profiler.do_bench() + print(f"TVM Latency: {tvm_latency} ms") + + assert tvm_latency is not None + + +def test_nvrtc_kernel_do_bench(): + run_nvrtc_kernel_do_bench(512, 1024, 768, False, False, "float16", "float16", "float16", 128, + 256, 32, 2) + + +def run_nvrtc_kernel_multi_stream(M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128): + program = matmul( + M, + N, + K, + block_M, + block_N, 
+ block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + matmul_kernel = tilelang.compile(program, execution_backend="nvrtc") + in_dtype = map_torch_type(in_dtype) + out_dtype = map_torch_type(out_dtype) + tensor_a = torch.randn(M, K, dtype=in_dtype).cuda() + tensor_b = torch.randn(K, N, dtype=in_dtype).cuda() + + if trans_A: + tensor_a = tensor_a.T + if trans_B: + tensor_b = tensor_b.T + tensor_c = torch.randn(M, N, dtype=out_dtype).cuda() + + num_streams = 4 + for _ in range(num_streams): + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + matmul_kernel(tensor_a, tensor_b, tensor_c) + + +def test_nvrtc_kernel_multi_stream(): + run_nvrtc_kernel_multi_stream(512, 1024, 768, False, False, "float16", "float16", "float16", + 128, 256, 32, 2) + + +def run_nvrtc_dynamic_shape(M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128): + program = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + matmul_kernel = tilelang.compile(program, execution_backend="nvrtc") + if isinstance(M, T.Var): + M = 1024 + if isinstance(N, T.Var): + N = 1024 + if isinstance(K, T.Var): + K = 768 + + in_dtype = map_torch_type(in_dtype) + out_dtype = map_torch_type(out_dtype) + + tensor_a = torch.randn(M, K, dtype=in_dtype).cuda() + tensor_b = torch.randn(K, N, dtype=in_dtype).cuda() + + if trans_A: + tensor_a = tensor_a.T + if trans_B: + tensor_b = tensor_b.T + tensor_c = torch.randn(M, N, dtype=out_dtype).cuda() + + matmul_kernel(tensor_a, tensor_b, tensor_c) + + tensor_ref_c = torch.matmul(tensor_a.to(torch.float), tensor_b.to(torch.float)).to(out_dtype) + tilelang.testing.torch_assert_close( + tensor_c, tensor_ref_c, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) + + +def test_nvrtc_dynamic_shape(): + run_nvrtc_dynamic_shape( + 
T.dynamic("m"), 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2) + + run_nvrtc_dynamic_shape( + T.dynamic("m"), T.dynamic("n"), 768, False, False, "float16", "float16", "float16", 128, + 256, 32, 2) + + run_nvrtc_dynamic_shape( + T.dynamic("m"), T.dynamic("n"), T.dynamic("k"), False, False, "float16", "float16", + "float16", 128, 256, 32, 2) + + +def check_hopper(): + if not torch.cuda.is_available(): + return False + props = torch.cuda.get_device_properties(0) + compute_capability = props.major, props.minor + return compute_capability == (9, 0) + + +def convolution_im2col(N, + C, + H, + W, + F, + K, + S, + D, + P, + block_M, + block_N, + block_K, + num_stages, + threads, + dtype="float16", + accum_dtype="float"): + KH, KW = K, K + OH = (H + 2 * P - D * (K - 1) - 1) // S + 1 + OW = (W + 2 * P - D * (K - 1) - 1) // S + 1 + + @T.prim_func + def main( + data: T.Tensor((N, H, W, C), dtype), + kernel: T.Tensor((KH, KW, C, F), dtype), + out: T.Tensor((N, OH, OW, F), dtype), + ): + with T.Kernel( + T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), + threads=threads) as (bx, by): + data_shared = T.alloc_shared((block_M, block_K), dtype) + kernel_shared = T.alloc_shared((block_K, block_N), dtype) + out_local = T.alloc_fragment((block_M, block_N), accum_dtype) + out_shared = T.alloc_shared((block_M, block_N), dtype) + + kernel_flat = T.Tensor((KH * KW * C, F), dtype, kernel.data) + out_flat = T.Tensor((N * OH * OW, F), dtype, out.data) + + T.annotate_layout({ + out_shared: tilelang.layout.make_swizzled_layout(out_shared), + data_shared: tilelang.layout.make_swizzled_layout(data_shared), + kernel_shared: tilelang.layout.make_swizzled_layout(kernel_shared), + }) + + T.clear(out_local) + for k_iter in T.Pipelined(T.ceildiv(KH * KW * C, block_K), num_stages=num_stages): + T.c2d_im2col(data, data_shared, by, k_iter, KH, S, D, P) + T.copy(kernel_flat[k_iter * block_K, bx * block_N], kernel_shared) + T.gemm(data_shared, kernel_shared, out_local) + + 
T.copy(out_local, out_shared) + T.copy(out_shared, out_flat[by * block_M, bx * block_N]) + + return main + + +def run_nvrtc_im2col_tma_desc(N, + C, + H, + W, + F, + K, + S, + D, + P, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=256): + """Test im2col TMA descriptor functionality in NVRTC backend.""" + program = convolution_im2col(N, C, H, W, F, K, S, D, P, block_M, block_N, block_K, num_stages, + num_threads) + + conv_kernel = tilelang.compile(program, out_idx=-1, execution_backend="nvrtc") + + a = torch.randn(N, H, W, C).cuda().half() + b = torch.randn(K, K, C, F).cuda().half() + + out_c = conv_kernel(a, b) + + # Reference implementation using torch.conv2d + def ref_program(A, B): + A = A.permute(0, 3, 1, 2) # N, H, W, C -> N, C, H, W + B = B.permute(3, 2, 0, 1) # H, W, C, F -> F, C, H, W + C = torch.conv2d(A, B, stride=S, padding=P, dilation=D) + C = C.permute(0, 2, 3, 1) # N, C, H, W -> N, H, W, C + return C + + ref_c = ref_program(a, b) + tilelang.testing.torch_assert_close( + out_c, ref_c, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) + + +def test_nvrtc_im2col_tma_desc(): + """Test im2col TMA descriptor with NVRTC backend.""" + if not check_hopper(): + import pytest + pytest.skip("Test requires Hopper GPU (compute capability 9.0)") + + # Small test case for im2col TMA descriptor + run_nvrtc_im2col_tma_desc( + N=4, + C=64, + H=32, + W=32, + F=64, + K=3, + S=1, + D=1, + P=1, + block_M=64, + block_N=128, + block_K=32, + num_stages=3, + num_threads=256) + + +def test_nvrtc_l2_persistent_map(): + """Test L2 persistent cache annotation with elementwise add.""" + from tilelang.language import annotate_l2_hit_ratio + + M = 1024 + N = 1024 + + @tilelang.jit(out_idx=[-1], execution_backend="nvrtc") + def elementwise_add_with_l2_cache( + M, + N, + block_size=256, + dtype="float32", + ): + + @T.prim_func + def kernel( + A: T.Tensor((M, N), dtype), + B: T.Tensor((M, N), dtype), + C: T.Tensor((M, N), dtype), + ): + with T.Kernel(M * N // block_size, 
threads=block_size) as bx: + # Annotate L2 persistent cache for buffer B + # B will be accessed multiple times and benefit from L2 caching + annotate_l2_hit_ratio({B: 0.8}) + + for i in T.serial(block_size): + idx = bx * block_size + i + if idx < M * N: + row = idx // N + col = idx % N + C[row, col] = A[row, col] + B[row, col] + + return kernel + + # Compile the kernel + kernel = elementwise_add_with_l2_cache(M, N) + + # Create test tensors + a = torch.randn(M, N, dtype=torch.float32).cuda() + b = torch.randn(M, N, dtype=torch.float32).cuda() + + # Run kernel with out_idx=[-1], C is returned not passed in + c = kernel(a, b) + + # Verify correctness + ref_c = a + b + tilelang.testing.torch_assert_close(c, ref_c, atol=1e-5, rtol=1e-5) + + print("L2 persistent map test passed!") + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/tilelang/jit/adapter/libgen.py b/tilelang/jit/adapter/libgen.py index 1e33ec04..208370b0 100644 --- a/tilelang/jit/adapter/libgen.py +++ b/tilelang/jit/adapter/libgen.py @@ -1,9 +1,7 @@ from __future__ import annotations import ctypes -import importlib import logging import os -import os.path as osp import subprocess import tempfile from typing import Any @@ -21,14 +19,6 @@ from .utils import is_cpu_target, is_cuda_target, is_hip_target logger = logging.getLogger(__name__) -try: - from tilelang.jit.adapter.nvrtc import is_nvrtc_available - if is_nvrtc_available: - import cuda.bindings.driver as cuda - from tilelang.contrib.nvrtc import compile_cuda -except ImportError: - is_nvrtc_available = False - class LibraryGenerator: srcpath: str | None = None @@ -183,95 +173,3 @@ class LibraryGenerator: def set_src_path(self, srcpath): self.srcpath = srcpath - - -class PyLibraryGenerator(LibraryGenerator): - host_func: str | None = None - culib = None - pymodule = None - - def __init__(self, target: Target, verbose: bool = False): - if not is_nvrtc_available: - raise ImportError("cuda-python is not available, nvrtc backend cannot be 
used. " - "Please install cuda-python via `pip install cuda-python` " - "if you want to use the nvrtc backend.") - super().__init__(target, verbose) - - @staticmethod - def import_from_file(module_name, file_path): - spec = importlib.util.spec_from_file_location(module_name, file_path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - def update_host_func(self, host_func: str): - self.host_func = host_func - - def load_lib(self, lib_path: str | None = None): - if lib_path is None: - lib_path = self.libpath - - pypath = lib_path.replace(".cubin", ".py") - self.pymodule = self.import_from_file("kernel", pypath) - - # Ensure the context is valid - ctx = cuda.cuCtxGetCurrent()[1] - if cuda.cuCtxGetApiVersion(ctx)[0] != cuda.CUresult.CUDA_SUCCESS: - import torch - torch.cuda.synchronize() - - result, self.culib = cuda.cuLibraryLoadFromFile( - bytes(lib_path, "utf-8"), [], [], 0, [], [], 0) - assert result == cuda.CUresult.CUDA_SUCCESS, f"Failed to load library: {lib_path}" - - def compile_lib(self, timeout: float = None): - target = self.target - verbose = self.verbose - if is_cuda_target(target): - from tilelang.env import (CUDA_HOME, CUTLASS_INCLUDE_DIR, TILELANG_TEMPLATE_PATH) - src = tempfile.NamedTemporaryFile(mode="w", suffix=".cu", delete=False) # noqa: SIM115 - libpath = src.name.replace(".cu", ".cubin") - - project_root = osp.join(osp.dirname(__file__), "..", "..") - if CUTLASS_INCLUDE_DIR is None: - cutlass_path = osp.abspath(osp.join(project_root, "3rdparty/cutlass/include")) - else: - cutlass_path = CUTLASS_INCLUDE_DIR - - if TILELANG_TEMPLATE_PATH is None: - tl_template_path = osp.abspath(osp.join(project_root, "src")) - else: - tl_template_path = TILELANG_TEMPLATE_PATH - - cuda_home = CUDA_HOME if CUDA_HOME else "/usr/local/cuda" - - options = [f"-I{tl_template_path}", f"-I{cutlass_path}", f"-I{cuda_home}/include"] - if self.compile_flags: - options += [ - item for flag in self.compile_flags for item in 
flag.split() - if item not in options - ] - - cubin_bytes = compile_cuda( - self.lib_code, target_format="cubin", options=options, verbose=verbose) - with open(libpath, "wb") as f: - f.write(cubin_bytes) - - src.write(self.lib_code) - src.flush() - - self.srcpath = src.name - self.libpath = libpath - - pypath = src.name.replace(".cu", ".py") - with open(pypath, "w") as f: - f.write(self.host_func) - else: - raise ValueError(f"Unsupported target: {target}") - - def __del__(self): - if self.culib: - result = cuda.cuLibraryUnload(self.culib)[0] - if result != cuda.CUresult.CUDA_SUCCESS: - logger.warning(f"Failed to unload library: {self.libpath}") - self.culib = None diff --git a/tilelang/jit/adapter/nvrtc/__init__.py b/tilelang/jit/adapter/nvrtc/__init__.py index c9068faf..faa08c19 100644 --- a/tilelang/jit/adapter/nvrtc/__init__.py +++ b/tilelang/jit/adapter/nvrtc/__init__.py @@ -5,7 +5,10 @@ This module provides runtime compilation support using NVIDIA's NVRTC API. import logging -__all__ = ['NVRTCKernelAdapter', 'is_nvrtc_available', 'check_nvrtc_available'] +__all__ = [ + 'NVRTCKernelAdapter', 'TLNVRTCSourceWrapper', 'NVRTCLibraryGenerator', 'is_nvrtc_available', + 'check_nvrtc_available' +] logger = logging.getLogger(__name__) @@ -37,7 +40,9 @@ def check_nvrtc_available(): # Conditionally import the adapter if is_nvrtc_available: - from .adapter import NVRTCKernelAdapter # noqa: F401 + from .adapter import NVRTCKernelAdapter + from .wrapper import TLNVRTCSourceWrapper + from .libgen import NVRTCLibraryGenerator else: # Provide a dummy class that raises error on instantiation class NVRTCKernelAdapter: @@ -45,3 +50,19 @@ else: def __init__(self, *args, **kwargs): raise ImportError(NVRTC_UNAVAILABLE_MESSAGE) + + @classmethod + def from_database(cls, *args, **kwargs): + raise ImportError(NVRTC_UNAVAILABLE_MESSAGE) + + class TLNVRTCSourceWrapper: + """Dummy TLNVRTCSourceWrapper that raises ImportError on instantiation.""" + + def __init__(self, *args, **kwargs): + 
raise ImportError(NVRTC_UNAVAILABLE_MESSAGE) + + class NVRTCLibraryGenerator: + """Dummy NVRTCLibraryGenerator that raises ImportError on instantiation.""" + + def __init__(self, *args, **kwargs): + raise ImportError(NVRTC_UNAVAILABLE_MESSAGE) diff --git a/tilelang/jit/adapter/nvrtc/adapter.py b/tilelang/jit/adapter/nvrtc/adapter.py index d6723a03..5f8a2827 100644 --- a/tilelang/jit/adapter/nvrtc/adapter.py +++ b/tilelang/jit/adapter/nvrtc/adapter.py @@ -9,12 +9,13 @@ from tvm.target import Target from tilelang import tvm as tvm from tilelang.engine.param import KernelParam from tilelang.jit.adapter.wrapper import TLPyWrapper -from tilelang.jit.adapter.libgen import PyLibraryGenerator from tilelang.utils.language import retrieve_func_from_module from tilelang.utils.target import determine_target from tilelang.jit.adapter.base import BaseKernelAdapter from tilelang.jit.adapter.nvrtc import is_nvrtc_available, check_nvrtc_available +from .libgen import NVRTCLibraryGenerator + logger = logging.getLogger(__name__) # Import cuda bindings if available @@ -75,7 +76,7 @@ class NVRTCKernelAdapter(BaseKernelAdapter): self.wrapper.assign_device_module(device_mod) self.host_func, self.function_names = self.wrapper.wrap(kernel_global_source) - self.lib_generator = PyLibraryGenerator(self.target, self.verbose) + self.lib_generator = NVRTCLibraryGenerator(self.target, self.verbose) self.lib_generator.update_lib_code(self.kernel_global_source) self.lib_generator.update_host_func(self.host_func) self.lib_generator.assign_compile_flags(compile_flags) @@ -130,7 +131,7 @@ class NVRTCKernelAdapter(BaseKernelAdapter): adapter.target = Target.canon_target(determine_target(target)) adapter.verbose = verbose - adapter.lib_generator = PyLibraryGenerator(adapter.target, adapter.verbose) + adapter.lib_generator = NVRTCLibraryGenerator(adapter.target, adapter.verbose) adapter.lib_generator.assign_compile_flags(compile_flags) adapter.lib_generator.load_lib(lib_path=kernel_lib_path) 
adapter.pymodule = adapter.lib_generator.pymodule diff --git a/tilelang/jit/adapter/nvrtc/libgen.py b/tilelang/jit/adapter/nvrtc/libgen.py new file mode 100644 index 00000000..50a587a5 --- /dev/null +++ b/tilelang/jit/adapter/nvrtc/libgen.py @@ -0,0 +1,235 @@ +"""NVRTC Library Generator for TileLang. + +Compiles CUDA kernels at runtime using NVRTC and manages resulting binaries. + +Why NVRTC instead of nvcc: +- No offline compilation step, enables true JIT workflows +- Works without CUDA toolkit installed (only requires driver) +- Allows kernel specialization based on runtime parameters + +Key responsibilities: +- Compile CUDA source to cubin using NVRTC API +- Generate accompanying Python launcher code +- Load compiled cubin and extract kernel handles +- Manage library lifecycle (load/unload) +""" +from __future__ import annotations +import importlib +import logging +import os.path as osp +import platform +import tempfile +from types import ModuleType + +from tvm.target import Target + +from tilelang import tvm as tvm +from tilelang.jit.adapter.libgen import LibraryGenerator +from tilelang.jit.adapter.utils import is_cuda_target +from tilelang.jit.adapter.nvrtc import is_nvrtc_available, NVRTC_UNAVAILABLE_MESSAGE + +logger = logging.getLogger(__name__) + +if is_nvrtc_available: + import cuda.bindings.driver as cuda + from tilelang.contrib.nvrtc import compile_cuda +else: + raise ImportError(NVRTC_UNAVAILABLE_MESSAGE) + + +class NVRTCLibraryGenerator(LibraryGenerator): + """Runtime compiler and loader for NVRTC-compiled CUDA kernels. + + Lifecycle: + 1. compile_lib(): CUDA source → cubin + Python launcher + 2. load_lib(): cubin → loaded library + kernel handles + 3. pymodule.call(): Execute kernels via Python launcher + 4. 
__del__: Cleanup (unload library) + + Why three files (cu, cubin, py): + - .cu: Source for debugging, kept in temp directory + - .cubin: Compiled binary, loaded by CUDA driver + - .py: Launch code, imported as Python module + + Attributes: + host_func: Generated Python launch code (from wrapper) + culib: CUDA library handle (CUlibrary) + pymodule: Imported Python module containing call() function + """ + host_func: str | None = None + culib: cuda.CUlibrary | None = None + pymodule: ModuleType | None = None + pypath: str | None = None + + def __init__(self, target: Target, verbose: bool = False): + """Initialize NVRTC library generator. + + Args: + target: Compilation target (must be CUDA) + verbose: Enable verbose compilation output + """ + super().__init__(target, verbose) + + @staticmethod + def import_from_file(module_name, file_path): + """Dynamically import Python module from file path. + + Standard importlib pattern for loading modules outside sys.path. + Used to import generated .py launcher code from temp directory. + + Args: + module_name: Name to assign to imported module + file_path: Absolute path to .py file + + Returns: + Imported module object + """ + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None or spec.loader is None: + raise ImportError(f"Failed to import module from file: {file_path}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + def update_host_func(self, host_func: str): + """Store generated Python launch code for later file write. + + Called by adapter after wrapper generates the launch code. + This is the bridge between code generation and file output. + + Args: + host_func: Python source code containing call() function + """ + self.host_func = host_func + + def load_lib(self, lib_path: str | None = None): + """Load compiled cubin and Python launcher into memory. + + Why two loads: + 1. Import Python module for launch logic + 2. 
Load cubin via CUDA Driver API for kernel handles + + Context synchronization: CUDA context must be current before loading. + If not, use torch.cuda.synchronize() to establish context. + + Args: + lib_path: Path to .cubin file (optional, uses self.libpath if None) + + Side effects: + - Sets self.pymodule to imported Python module + - Sets self.culib to CUDA library handle + """ + if lib_path is None: + lib_path = self.libpath + else: + self.libpath = lib_path + + self.pypath = lib_path.replace(".cubin", ".py") + self.pymodule = self.import_from_file("kernel", self.pypath) + + # Ensure the context is valid + ctx = cuda.cuCtxGetCurrent()[1] + if cuda.cuCtxGetApiVersion(ctx)[0] != cuda.CUresult.CUDA_SUCCESS: + import torch + torch.cuda.synchronize() + + result, self.culib = cuda.cuLibraryLoadFromFile( + bytes(lib_path, "utf-8"), [], [], 0, [], [], 0) + if result != cuda.CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Failed to load library: {lib_path}, error: {result}") + + def compile_lib(self, timeout: float | None = None): + """Compile CUDA source to cubin using NVRTC and write output files. + + Output artifacts (all in temp directory): + - .cu: Source code (for debugging) + - .cubin: Compiled binary (for execution) + - .py: Python launcher (for calling kernels) + + Include paths setup: + - TileLang templates: kernel primitives and utilities + - CUTLASS: optimized GEMM/tensor ops + - CUDA headers: driver/runtime APIs + + Why architecture detection: + ARM64 servers (SBSA) have different header paths than x86_64. 
+ + Args: + timeout: Compilation timeout in seconds (currently unsupported by NVRTC compiler) + + Side effects: + - Writes .cu, .cubin, .py files to temp directory + - Sets self.srcpath, self.libpath, self.pypath + """ + target = self.target + verbose = self.verbose + if is_cuda_target(target): + from tilelang.env import (CUDA_HOME, CUTLASS_INCLUDE_DIR, TILELANG_TEMPLATE_PATH) + src = tempfile.NamedTemporaryFile(mode="w", suffix=".cu", delete=False) + libpath = src.name.replace(".cu", ".cubin") + + project_root = osp.join(osp.dirname(__file__), "..", "..") + if CUTLASS_INCLUDE_DIR is None: + cutlass_path = osp.abspath(osp.join(project_root, "3rdparty/cutlass/include")) + else: + cutlass_path = CUTLASS_INCLUDE_DIR + + if TILELANG_TEMPLATE_PATH is None: + tl_template_path = osp.abspath(osp.join(project_root, "src")) + else: + tl_template_path = TILELANG_TEMPLATE_PATH + + cuda_home = CUDA_HOME if CUDA_HOME else "/usr/local/cuda" + __CUDACC_VER_MAJOR__ = cuda.CUDA_VERSION // 1000 + + # Determine target architecture + machine = platform.machine() + target_arch = "sbsa-linux" if machine in ("aarch64", "arm64") else "x86_64-linux" + + options = [ + f"-I{tl_template_path}", + f"-I{cutlass_path}", + f"-I{cuda_home}/include", + f"-I{cuda_home}/targets/{target_arch}/include", + f"-I{cuda_home}/targets/{target_arch}/include/cccl", + f"-D__CUDACC_VER_MAJOR__={__CUDACC_VER_MAJOR__}", + ] + if self.compile_flags: + options += [ + item for flag in self.compile_flags for item in flag.split() + if item not in options + ] + + cubin_bytes = compile_cuda( + self.lib_code, target_format="cubin", options=options, verbose=verbose) + with open(libpath, "wb") as f: + f.write(cubin_bytes) + + src.write(self.lib_code) + src.flush() + + self.srcpath = src.name + self.libpath = libpath + self.pypath = src.name.replace(".cu", ".py") + if self.host_func is None: + raise RuntimeError( + "Host function is not set, please call update_host_func() first.") + with open(self.pypath, "w") as f: + 
f.write(self.host_func) + else: + raise ValueError(f"Unsupported target: {target}") + + def __del__(self): + """Cleanup: unload CUDA library when object is destroyed. + + Critical for resource management - CUDA libraries consume GPU memory. + Failure to unload is logged but not raised (destructor can't fail). + + Why explicit unload: + Python GC doesn't know about GPU resources, must release manually. + """ + if self.culib: + result = cuda.cuLibraryUnload(self.culib)[0] + if result != cuda.CUresult.CUDA_SUCCESS: + logger.warning(f"Failed to unload library: {self.libpath}") + self.culib = None diff --git a/tilelang/jit/adapter/nvrtc/wrapper.py b/tilelang/jit/adapter/nvrtc/wrapper.py new file mode 100644 index 00000000..1a29adef --- /dev/null +++ b/tilelang/jit/adapter/nvrtc/wrapper.py @@ -0,0 +1,563 @@ +"""NVRTC Source Wrapper for TileLang. + +Generates Python runtime code for launching CUDA kernels compiled via NVRTC. + +Why this exists: +- NVRTC compiles kernels at runtime, needs Python launch code (not C++) +- TMA descriptors must be initialized once per unique buffer, not per kernel +- L2 cache policies require explicit CUDA Driver API setup/teardown + +Key design: +- Two-pass generation: collect all descriptors first, then generate launches +- Dict-based deduplication ensures TMA descriptors created only once +- Generates pure Python using cuda.bindings.driver for zero C++ dependency +""" +from __future__ import annotations +from typing import Any, ClassVar + +from tvm import IRModule +from tvm.target import Target +from tvm.tir.stmt_functor import post_order_visit + +from tilelang import tvm as tvm +from tilelang.jit.adapter.wrapper import TLCUDASourceWrapper +from tilelang.jit.adapter.utils import (match_declare_kernel, pythonic_expr, + parse_function_call_args, parse_tma_descriptor_args) + +PREDEF_HOST_FUNC_PY = """ +from cuda.bindings.driver import ( + CUtensorMapDataType, + CUtensorMapInterleave, + CUtensorMapSwizzle, + CUtensorMapL2promotion, + 
CUtensorMapFloatOOBfill, + cuTensorMapEncodeTiled, + cuTensorMapEncodeIm2col, + CUresult, + cuKernelSetAttribute, + CUfunction_attribute, + CUdevice, + CUlaunchConfig, + cuLaunchKernelEx, + cuuint64_t, + cuuint32_t, + CUkernel, +) +import ctypes + +_function_names = {} + +def call({}): + {} +""" + +TMA_DESC_INIT_FUNC_PY = """ + {0}_type = CUtensorMapDataType({1}) + {0}_tensorRank = {2} + {0}_globalAddress = {3}.data_ptr() + {0}_globalDim = [{4}] + {0}_globalStride = [{5}][1:] + {0}_boxDim = [{6}] + {0}_elementStrides = [{7}] + {0}_interleave = CUtensorMapInterleave({8}) + {0}_swizzle = CUtensorMapSwizzle({9}) + {0}_l2Promotion = CUtensorMapL2promotion({10}) + {0}_oobFill = CUtensorMapFloatOOBfill({11}) + + res, {0} = cuTensorMapEncodeTiled( + {0}_type, + {0}_tensorRank, + {0}_globalAddress, + {0}_globalDim, + {0}_globalStride, + {0}_boxDim, + {0}_elementStrides, + {0}_interleave, + {0}_swizzle, + {0}_l2Promotion, + {0}_oobFill, + ) + + if res != CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Failed to initialize the TMA descriptor {0}: {{res}}") +""" + +TMA_IM2COL_DESC_INIT_FUNC_PY = """ + {0}_type = CUtensorMapDataType({1}) + {0}_tensorRank = {2} + {0}_globalAddress = {3}.data_ptr() + {0}_globalDim = [{4}] + {0}_globalStride = [{5}][1:] + {0}_elementStrides = [{6}] + {0}_lowerCorner = [{7}] + {0}_upperCorner = [{8}] + {0}_channelsPerPixel = {9} + {0}_pixelsPerColumn = {10} + {0}_interleave = CUtensorMapInterleave({11}) + {0}_swizzle = CUtensorMapSwizzle({12}) + {0}_l2Promotion = CUtensorMapL2promotion({13}) + {0}_oobFill = CUtensorMapFloatOOBfill({14}) + + res, {0} = cuTensorMapEncodeIm2col( + {0}_type, + {0}_tensorRank, + {0}_globalAddress, + {0}_globalDim, + {0}_globalStride, + {0}_lowerCorner, + {0}_upperCorner, + {0}_channelsPerPixel, + {0}_pixelsPerColumn, + {0}_elementStrides, + {0}_interleave, + {0}_swizzle, + {0}_l2Promotion, + {0}_oobFill, + ) + + if res != CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Failed to initialize the TMA descriptor {0}: {{res}}") 
+""" + +L2_PERSISTENT_MAP_CREATE_HANDLE_PY = """ + from cuda.bindings.driver import ( + CUstreamAttrValue, + CUstreamAttrID, + CUlimit, + CUaccessProperty, + cuCtxGetLimit, + cuCtxSetLimit, + cuStreamSetAttribute, + cuCtxResetPersistingL2Cache, + ) + + stream_attribute = CUstreamAttrValue() + res, init_persisting_l2_cache_size = cuCtxGetLimit(CUlimit.CU_LIMIT_PERSISTING_L2_CACHE_SIZE) + if res != CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Failed to get L2 cache size limit: {{res}}") +""" + +L2_PERSISTENT_MAP_INIT_FUNC_PY = """ + stream_attribute.accessPolicyWindow.hitRatio = {1} + stream_attribute.accessPolicyWindow.hitProp = CUaccessProperty.CU_ACCESS_PROPERTY_PERSISTING + stream_attribute.accessPolicyWindow.missProp = CUaccessProperty.CU_ACCESS_PROPERTY_STREAMING + + res = cuCtxSetLimit(CUlimit.CU_LIMIT_PERSISTING_L2_CACHE_SIZE, {2})[0] + if res != CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Failed to set L2 cache size limit: {{res}}") + + stream_attribute.accessPolicyWindow.base_ptr = {0}.data_ptr() + stream_attribute.accessPolicyWindow.num_bytes = {2} + + res = cuStreamSetAttribute(stream, CUstreamAttrID.CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW, stream_attribute)[0] + if res != CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Failed to set stream L2 access policy: {{res}}") +""" + +L2_PERSISTENT_MAP_RESET_HANDLE_PY = """ + stream_attribute.accessPolicyWindow.num_bytes = 0 + res = cuStreamSetAttribute(stream, CUstreamAttrID.CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW, stream_attribute)[0] + if res != CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Failed to reset stream L2 access policy: {{res}}") + + res = cuCtxResetPersistingL2Cache()[0] + if res != CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Failed to reset L2 cache: {{res}}") + + res = cuCtxSetLimit(CUlimit.CU_LIMIT_PERSISTING_L2_CACHE_SIZE, init_persisting_l2_cache_size)[0] + if res != CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Failed to restore L2 cache size limit: {{res}}") +""" + +KERNEL_LAUNCH_FUNC_PY 
= """ + res = cuKernelSetAttribute( + CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + {7}, + kernels["{0}"], + CUdevice({10}) + )[0] + if res != CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Failed to set max dynamic shared memory size to {7} for kernel {0}: {{res}}") + + config = CUlaunchConfig() + config.gridDimX = {1} + config.gridDimY = {2} + config.gridDimZ = {3} + config.blockDimX = {4} + config.blockDimY = {5} + config.blockDimZ = {6} + config.sharedMemBytes = {7} + config.hStream = stream + + arg_values = {8} + arg_types = {9} + + res = cuLaunchKernelEx(config, kernels["{0}"], (arg_values, arg_types), 0)[0] + if res != CUresult.CUDA_SUCCESS: + raise RuntimeError(f"Failed to launch kernel {0}: {{res}}") +""" + + +class TLNVRTCSourceWrapper(TLCUDASourceWrapper): + """NVRTC backend wrapper: generates Python kernel launch code. + + Core responsibility: transform TVM IRModule into executable Python function + that initializes resources (TMA descriptors, L2 cache) and launches kernels + via CUDA Driver API. + + Data flow: + IRModule → collect kernel metadata → deduplicate resources → + generate Python code → executable function + + Why Python generation instead of C++: + NVRTC workflow requires runtime compilation, Python is the natural host. + Using cuda.bindings.driver eliminates C++ wrapper complexity. 
+ """ + + _TYPE_MAP: ClassVar[dict[str, str]] = { + "float32": "ctypes.c_float", + "float16": "ctypes.c_uint16", + "bfloat16": "ctypes.c_uint16", + "float8_e4m3": "ctypes.c_uint8", + "float8_e4m3fn": "ctypes.c_uint8", + "float8_e5m2": "ctypes.c_uint8", + "float64": "ctypes.c_double", + "int64": "ctypes.c_int64", + "int32": "ctypes.c_int32", + "uint32": "ctypes.c_uint32", + "bool": "ctypes.c_bool", + "int8": "ctypes.c_int8", + "uint8": "ctypes.c_uint8", + "int16": "ctypes.c_int16", + "uint16": "ctypes.c_uint16", + "uchar": "ctypes.c_uint8", + } + + _generated_host_func: str | None = None + + def __init__(self, + scheduled_ir_module: IRModule, + source: str, + target: Target, + device_mod: IRModule | None = None, + host_mod: IRModule | None = None, + pass_configs: dict[str, Any] | None = None): + """Initialize NVRTC wrapper with compiled IR modules. + + Args: + scheduled_ir_module: TVM IR after scheduling passes + source: Generated CUDA C++ source code + target: Compilation target (should be NVRTC-compatible) + device_mod: Device-side IR module (kernel functions) + host_mod: Host-side IR module (launch logic) + pass_configs: Optional compiler pass configurations + """ + super().__init__(scheduled_ir_module, source, target, device_mod, host_mod, pass_configs) + + @property + def host_func(self): + """Override parent's host_func to return generated Python code.""" + if self._generated_host_func is not None: + return self._generated_host_func + return super().host_func + + @host_func.setter + def host_func(self, value): + """Allow setting generated host function code.""" + self._generated_host_func = value + + def _pythonic_expr(self, expr: tvm.tir.PrimExpr) -> str: + """Convert TVM expression to Python string, ignoring casts. + + Casts are noise in generated Python code - Python is dynamically typed. 
+ """ + return pythonic_expr(expr, self._TYPE_MAP, ignore_cast=True) + + def create_dispatch_func(self, code, function_informations): + """Generate Python dispatch function that launches multiple CUDA kernels. + + Why two-pass design: + Pass 1: Collect TMA descriptors from all kernels into shared dicts + Pass 2: Generate code - descriptors first (deduplicated), then launches + + Single-pass would create duplicate descriptors for each kernel. + Dict naturally deduplicates by descriptor name. + + Args: + code: CUDA C++ source containing kernel declarations + function_informations: Dict mapping kernel names to metadata + (grid/block dims, params, shared memory size) + + Returns: + Python source code defining a call() function that: + 1. Initializes L2 cache policies (if needed) + 2. Creates TMA descriptors once per unique buffer + 3. Launches each kernel with cuLaunchKernelEx + 4. Resets L2 cache policies (if needed) + """ + # Extract the set of dynamic symbolic names used in the primary function + dynamic_symbolic_set = self.get_dynamic_symbolic_set(self.prim_func) + + function_args = [{"name": "kernels", "type": "dict[str, CUkernel]"}] + # Collect function arguments based on primary function's parameters and buffer mappings + for param in self.prim_func.params: + if param in self.prim_func.buffer_map: + buffer = self.prim_func.buffer_map[param] + function_args.append({ + "name": buffer.data.name, + "type": "ctypes.c_void_p", + }) + elif isinstance(param, tvm.tir.Var): + function_args.append({"name": param.name, "type": self._lookup_type(param.dtype)}) + else: + raise ValueError( + f"Parameter {param} is not in the buffer map of the primary function.") + # Add dynamic symbols as integer arguments + for dyn_sym in dynamic_symbolic_set: + if dyn_sym not in [arg["name"] for arg in function_args]: + function_args.append({"name": dyn_sym, "type": "ctypes.c_int"}) + + function_args.append(self.get_stream_type()) + + # Format the function arguments for declaration + 
def_args = ", ".join([f"{arg['name']}" for arg in function_args]) + + # Check if any function needs L2 Persistent Map + has_l2_persistent_map = False + for function_name, _ in function_informations.items(): + if function_name in self.l2_persistent_map: + has_l2_persistent_map = True + break + + desc_name_map: dict[str, str] = {} + desc_name_var_map: dict[str, tvm.tir.Var] = {} + device_index = 0 + kernel_launch_code = """""" + if has_l2_persistent_map: + kernel_launch_code += L2_PERSISTENT_MAP_CREATE_HANDLE_PY + + # First pass: collect all TMA descriptors from all kernels to avoid duplication + kernel_info_list = [] + for function_name, function_info in function_informations.items(): + block_info = function_info["block_info"] + grid_info = function_info["grid_info"] + dynamic_smem_buf = function_info["dynamic_smem_buf"] + function_params = function_info["function_params"] + + # Find the location of the global kernel function in the code + index = match_declare_kernel(code, function_name + "(") + + # Analyze the function declaration to prepare for argument extraction + declaration = code[index:].split(";")[0] + + # Identify the start of the function body to insert arguments + index = code.index("{", index) + + # Transform function for NVRTC: returns (arg_value, arg_type) tuples + def transform_nvrtc_arg(name: str, arg_type: str): + if arg_type == "ctypes.c_void_p": + return (f"{name}.data_ptr()", arg_type) + return (name, arg_type) + + call_args = parse_function_call_args(declaration, function_args, function_params, + desc_name_map, desc_name_var_map, + transform_nvrtc_arg) + + for arg_name, arg_type in call_args: + if arg_type == "ctypes.c_void_p": + device_index = f"{arg_name.replace('.data_ptr()', '')}.device.index" + break + + # Store kernel info for second pass + kernel_info_list.append({ + 'function_name': function_name, + 'block_info': block_info, + 'grid_info': grid_info, + 'dynamic_smem_buf': dynamic_smem_buf, + 'call_args': call_args, + 'device_index': 
device_index, + }) + + # Generate TMA descriptor initialization code once for all kernels + kernel_launch_code += self.generate_tma_descriptor_args(desc_name_map, desc_name_var_map) + + # Second pass: generate kernel launch code for each kernel + for kernel_info in kernel_info_list: + function_name = kernel_info['function_name'] + block_info = kernel_info['block_info'] + grid_info = kernel_info['grid_info'] + dynamic_smem_buf = kernel_info['dynamic_smem_buf'] + call_args = kernel_info['call_args'] + device_index = kernel_info['device_index'] + + arg_names = ", ".join([arg[0] for arg in call_args]) + arg_types = ", ".join([arg[1] for arg in call_args]) + smem_str = 0 if dynamic_smem_buf is None else dynamic_smem_buf + + # Generate L2 persistent map initialization for this function + init_l2_persistent_map = self.generate_l2_persistent_map(function_name) + kernel_launch_code += init_l2_persistent_map + + # Generate kernel launch code + kernel_launch_code += KERNEL_LAUNCH_FUNC_PY.format(function_name, + self._pythonic_expr(grid_info[0]), + self._pythonic_expr(grid_info[1]), + self._pythonic_expr(grid_info[2]), + self._pythonic_expr(block_info[0]), + self._pythonic_expr(block_info[1]), + self._pythonic_expr(block_info[2]), + smem_str, arg_names, arg_types, + device_index) + + # Reset L2 persistent map after all kernel execution + if has_l2_persistent_map: + kernel_launch_code += L2_PERSISTENT_MAP_RESET_HANDLE_PY + + # Wrap the kernel dispatch logic in an external C function + host_func = PREDEF_HOST_FUNC_PY.format( + repr(list(function_informations.keys())), def_args, kernel_launch_code) + return host_func + + def generate_l2_persistent_map(self, function_name: str) -> str: + """Generate Python code to configure L2 cache persistence for a kernel. + + L2 persistence pins frequently-accessed data in L2 cache to reduce + memory bandwidth. Requires explicit setup via CUDA stream attributes. 
+ + Args: + function_name: Kernel name to check for L2 persistence config + + Returns: + Python code that sets stream access policy window, or empty + string if no L2 persistence configured for this kernel. + """ + if function_name not in self.l2_persistent_map: + return "" + init_l2_persistent_map = "" + for buffer_name, (hit_ratio, + size_in_bytes) in self.l2_persistent_map[function_name].items(): + # Get persisting_l2_cache_max_size + from tilelang.carver.arch.driver import get_persisting_l2_cache_max_size + persisting_l2_cache_max_size = get_persisting_l2_cache_max_size() + try: + num_bytes = min(size_in_bytes, persisting_l2_cache_max_size) + except TypeError: + # as size_in_bytes may be a symbolic expression + num_bytes = persisting_l2_cache_max_size + init_l2_persistent_map += L2_PERSISTENT_MAP_INIT_FUNC_PY.format( + buffer_name, float(hit_ratio), self._pythonic_expr(num_bytes)) + + return init_l2_persistent_map + + def generate_tma_descriptor_args(self, desc_name_map: dict[str, str], + desc_name_var_map: dict[str, tvm.tir.Var]) -> str: + """Generate Python code to initialize TMA descriptors. + + TMA (Tensor Memory Accelerator) descriptors are opaque CUDA objects + that describe memory layout for async copies. Must be created on host + before kernel launch. + + Args: + desc_name_map: Maps descriptor variable names to buffer names + desc_name_var_map: Maps descriptor names to TVM variables + + Returns: + Python code that calls cuTensorMapEncodeTiled/Im2col for each + unique descriptor. Empty string if no TMA descriptors needed. 
+ """ + tma_descriptor_init = "" + if self.tma_descriptor_args is None: + return tma_descriptor_init + + # Parse TMA descriptor arguments using the common utility + parsed_params = parse_tma_descriptor_args(self.tma_descriptor_args, desc_name_map, + desc_name_var_map, self._pythonic_expr) + + # Generate Python code from parsed parameters + for params in parsed_params: + if not params.is_img2col: + tma_descriptor_init += TMA_DESC_INIT_FUNC_PY.format( + params.handle_name, params.dtype, params.tensor_rank, params.global_address, + ", ".join(map(lambda x: f"cuuint64_t({x})", params.global_dim)), + ", ".join(map(lambda x: f"cuuint64_t({x})", params.global_stride)), + ", ".join(map(lambda x: f"cuuint32_t({x})", params.box_dim)), + ", ".join(map(lambda x: f"cuuint32_t({x})", params.element_strides)), + params.interleave, params.swizzle, params.l2_promotion, params.oob_fill) + else: + tma_descriptor_init += TMA_IM2COL_DESC_INIT_FUNC_PY.format( + params.handle_name, params.dtype, params.tensor_rank, params.global_address, + ", ".join(map(lambda x: f"cuuint64_t({x})", params.global_dim)), + ", ".join(map(lambda x: f"cuuint64_t({x})", params.global_stride)), + ", ".join(map(lambda x: f"cuuint32_t({x})", + params.element_strides)), ", ".join(params.lower_corner), + ", ".join(params.upper_corner), params.smem_box_channel, params.smem_box_pixel, + params.interleave, params.swizzle, params.l2_promotion, params.oob_fill) + + return tma_descriptor_init + + def update_lib_code(self, code: str): + """Update library code and generate host dispatch function. + + Entry point for code generation. Walks the host IR to extract kernel + call sites, matches them with device kernels, then generates Python + dispatch code via create_dispatch_func(). + + Args: + code: CUDA C++ source code containing compiled kernels + + Returns: + The same code string (stored in self.lib_code). Side effect: + sets self.host_func to generated Python dispatcher. 
+ """ + # Update the library code with the given code string + self.lib_code = code + + # Organize function information for code generation + function_informations = {} + for function_name in self.function_names: + # Do not update function with dispatch host function + if (function_name not in self.block_info) or (function_name not in self.grid_info): + continue + + assert function_name in self.device_mod, f"Function {function_name} not found in device module" + device_func = self.device_mod[function_name] + kernel_params_cnt = len(device_func.params) + function_params: list[str] | None = None + + def visitor(node, fn=function_name, param_cnt=kernel_params_cnt): + nonlocal function_params + if isinstance(node, tvm.tir.Call): + if not (hasattr(node, "op") and + node.op == tvm.ir.Op.get("tir.tvm_call_packed")): + return + args = node.args + if not args or args[0] != fn: + return + if len(args) < 1 + param_cnt: + raise AssertionError( + "tvm_call_packed should have at least 1 argument and match device function parameters" + ) + function_params = args[1:1 + param_cnt] + + post_order_visit(self.host_func.body, visitor) + assert function_params is not None, "function_params should not be None" + + function_informations[function_name] = { + "function_name": function_name, + "block_info": self.block_info[function_name], + "grid_info": self.grid_info[function_name], + "dynamic_smem_buf": self.dynamic_smem_buf[function_name], + "function_params": function_params, + } + + # Create the host function wrapper for the CUDA kernel + self.host_func = self.create_dispatch_func(code, function_informations) + return self.lib_code + + def get_stream_type(self) -> dict[str, str]: + """Return stream parameter spec for Python signature. + + NVRTC backend uses raw int for stream handle (not cudaStream_t pointer). + Default to 0 (NULL stream) for convenience. 
+ """ + return {"name": "stream=0", "type": "int"} diff --git a/tilelang/jit/adapter/utils.py b/tilelang/jit/adapter/utils.py index efc965e1..94e590d3 100644 --- a/tilelang/jit/adapter/utils.py +++ b/tilelang/jit/adapter/utils.py @@ -1,7 +1,7 @@ from __future__ import annotations import re -from typing import Literal +from typing import Literal, Callable, Any from tilelang import tvm as tvm from tvm import IRModule, tir from tvm.target import Target @@ -107,13 +107,16 @@ def get_annotated_mod( return dispatch[model_type](mod) -def pythonic_expr(expr: tvm.tir.PrimExpr, dtype_map: dict[str, str] | None = None) -> str: +def pythonic_expr(expr: tvm.tir.PrimExpr, + dtype_map: dict[str, str] | None = None, + ignore_cast: bool = False) -> str: """ Converts a TVM PrimExpr into a Python-style string, correctly handling operator precedence. Args: expr: The TVM PrimExpr to convert. - + dtype_map: A dictionary mapping data types to their string representations. + ignore_cast: Whether to ignore the cast operator and return the string representation of the value without the cast. Returns: A string representation of the expression. 
""" @@ -158,10 +161,11 @@ def pythonic_expr(expr: tvm.tir.PrimExpr, dtype_map: dict[str, str] | None = Non elif isinstance(node, tvm.tir.Cast): # C-style cast has high precedence value_str, _ = node_to_result_map[node.value] - if dtype_map is None: - s = f"({node.dtype}){value_str}" + if ignore_cast: + s = value_str else: - s = f"({dtype_map[node.dtype]}){value_str}" + type_str = node.dtype if dtype_map is None else dtype_map[node.dtype] + s = f"({type_str}){value_str}" p = PRECEDENCE.get(type(node), ATOMIC_PRECEDENCE) elif isinstance( node, @@ -216,3 +220,238 @@ def pythonic_expr(expr: tvm.tir.PrimExpr, dtype_map: dict[str, str] | None = Non tvm.tir.stmt_functor.post_order_visit(expr, _visitor) return next(iter(node_to_result_map[expr]), "") + + +def maybe_desc_name(name: str, + matches: list[str], + i: int, + desc_name_map: dict[str, str] | None = None) -> bool: + """ + Check if a parameter name corresponds to a TMA descriptor. + + Args: + name: The parameter name to check. + matches: List of all matched parameter names. + i: Index of the current match. + desc_name_map: Optional mapping to store descriptor name relationships. + + Returns: + True if the parameter is a TMA descriptor. + """ + match = matches[i] + if not (match == name + "_desc" or match.startswith(name + "_desc_")): + return False + desc_decls = [] + if desc_name_map is not None: + desc_name_map[match] = name + if i > 0: + desc_decls.append(matches[i - 1]) + if i < len(matches) - 1: + desc_decls.append(matches[i + 1]) + return any([decl == "CUtensorMap" for decl in desc_decls]) + + +def parse_function_call_args( + declaration: str, + function_args: list[dict[str, str]], + function_params: list[Any], + desc_name_map: dict[str, str] | None = None, + desc_name_var_map: dict[str, tvm.tir.Var] | None = None, + transform_arg: Callable[[str, str], Any] | None = None, +) -> list[Any]: + """ + Parse function call arguments from a kernel declaration. 
+ + Args: + declaration: The kernel function declaration string. + function_args: List of function argument specifications. + function_params: List of function parameters from TVM IR. + desc_name_map: Optional mapping for descriptor names. + desc_name_var_map: Optional mapping from descriptor names to TVM variables. + transform_arg: Optional function to transform each argument (name, type) -> result. + + Returns: + List of parsed call arguments. + """ + pattern = r"[,\s]*(?:\w+\s*\*+\s*__restrict__\s+)?(\w+)" + matches = re.findall(pattern, declaration) + call_args = [] + + for i, match in enumerate(matches): + for arg in function_args: + if arg["name"] == match: + if transform_arg is not None: + call_args.append(transform_arg(match, arg["type"])) + else: + call_args.append(match) + elif maybe_desc_name(arg["name"], matches, i, desc_name_map): + if transform_arg is not None: + call_args.append(transform_arg(match, "None")) + else: + call_args.append(match) + if desc_name_var_map is not None and function_params is not None: + assert len(call_args) <= len(function_params), \ + f"Too many arguments: {len(call_args)} > {len(function_params)}" + desc_name_var_map[match] = function_params[len(call_args) - 1] + + return call_args + + +class TMADescriptorParams: + """Parsed TMA descriptor parameters.""" + + def __init__(self, + handle_name: str, + dtype: str, + tensor_rank: int, + global_address: Any, + is_img2col: bool = False): + self.handle_name = handle_name + self.dtype = dtype + self.tensor_rank = tensor_rank + self.global_address = global_address + self.is_img2col = is_img2col + + # Common fields + self.global_dim: list[str] = [] + self.global_stride: list[str] = [] + self.element_strides: list[str] = [] + self.interleave: str = "" + self.swizzle: str = "" + self.l2_promotion: str = "" + self.oob_fill: str = "" + + # Tiled-specific fields + self.box_dim: list[str] = [] + + # Im2col-specific fields + self.lower_corner: list[str] = [] + self.upper_corner: list[str] = 
[] + self.smem_box_channel: str = "" + self.smem_box_pixel: str = "" + + +def parse_tma_descriptor_args( + tma_descriptor_args: dict[tvm.tir.Var, list[Any]], + desc_name_map: dict[str, str], + desc_name_var_map: dict[str, tvm.tir.Var], + pythonic_expr_func: Callable[[Any], str], +) -> list[TMADescriptorParams]: + """ + Parse TMA descriptor arguments into structured parameters. + + Args: + tma_descriptor_args: Dictionary mapping TMA descriptor variables to their arguments. + desc_name_map: Mapping from descriptor handles to parameter names. + desc_name_var_map: Mapping from descriptor handles to TVM variables. + pythonic_expr_func: Function to convert TVM expressions to strings. + + Returns: + List of parsed TMA descriptor parameters. + """ + if not tma_descriptor_args: + return [] + + results = [] + + for handle_name, _ in desc_name_map.items(): + assert handle_name in desc_name_var_map, \ + f"Handle name {handle_name} not found in desc_name_var_map" + desc_var = desc_name_var_map[handle_name] + + assert desc_var in tma_descriptor_args, \ + f"TMA descriptor {desc_var} not found in {tma_descriptor_args}" + args = tma_descriptor_args[desc_var] + + # Skip __tvm_tensormap_create_tiled and second element (like CUDA version) + if len(args) < 3: + raise ValueError( + f"TMA descriptor args too short: {len(args)} elements, expected at least 3") + + tma_create_str, _, dtype, tensor_rank, global_address, *remaining_args = args + + is_img2col = (tma_create_str.value == "__tvm_tensormap_create_im2col") + + # Convert basic fields + dtype = pythonic_expr_func(dtype) + tensor_rank = int(pythonic_expr_func(tensor_rank)) + + # Validate tensor_rank + if not isinstance(tensor_rank, int) or tensor_rank <= 0: + raise ValueError(f"Invalid tensor_rank: {tensor_rank}. 
Must be a positive integer") + + params = TMADescriptorParams(handle_name, dtype, tensor_rank, global_address, is_img2col) + + if not is_img2col: + # Tiled mode + expected_args_len = 4 * tensor_rank + 4 + if len(remaining_args) < expected_args_len: + raise ValueError(f"Insufficient remaining args: got {len(remaining_args)}, " + f"expected {expected_args_len} for tensor_rank {tensor_rank}") + + # Extract dimensions and strides + params.global_dim = [pythonic_expr_func(i) for i in remaining_args[:tensor_rank]] + params.global_stride = [ + pythonic_expr_func(i) for i in remaining_args[tensor_rank:2 * tensor_rank] + ] + params.box_dim = [ + pythonic_expr_func(i) for i in remaining_args[2 * tensor_rank:3 * tensor_rank] + ] + params.element_strides = [ + pythonic_expr_func(i) for i in remaining_args[3 * tensor_rank:4 * tensor_rank] + ] + + # Extract remaining parameters + try: + interleave, swizzle, l2_promotion, oob_fill = remaining_args[4 * tensor_rank:4 * + tensor_rank + 4] + params.interleave = pythonic_expr_func(interleave) + params.swizzle = pythonic_expr_func(swizzle) + params.l2_promotion = pythonic_expr_func(l2_promotion) + params.oob_fill = pythonic_expr_func(oob_fill) + except ValueError as e: + raise ValueError( + "Failed to unpack the final 4 TMA parameters (interleave, swizzle, l2Promotion, oobFill)" + ) from e + else: + # Im2col mode + expected_args_len = 5 * tensor_rank + 2 + if len(remaining_args) < expected_args_len: + raise ValueError(f"Insufficient remaining args: got {len(remaining_args)}, " + f"expected {expected_args_len} for tensor_rank {tensor_rank}") + + # Extract dimensions and strides + params.global_dim = [pythonic_expr_func(i) for i in remaining_args[:tensor_rank]] + params.global_stride = [ + pythonic_expr_func(i) for i in remaining_args[tensor_rank:2 * tensor_rank] + ] + params.element_strides = [ + pythonic_expr_func(i) for i in remaining_args[2 * tensor_rank:3 * tensor_rank] + ] + params.lower_corner = [ + pythonic_expr_func(i) for i in 
remaining_args[3 * tensor_rank:4 * tensor_rank - 2] + ] + params.upper_corner = [ + pythonic_expr_func(i) + for i in remaining_args[4 * tensor_rank - 2:5 * tensor_rank - 4] + ] + + # Extract remaining parameters + try: + smem_box_pixel, smem_box_channel, interleave, swizzle, l2_promotion, oob_fill = \ + remaining_args[5 * tensor_rank - 4:5 * tensor_rank + 2] + params.smem_box_pixel = pythonic_expr_func(smem_box_pixel) + params.smem_box_channel = pythonic_expr_func(smem_box_channel) + params.interleave = pythonic_expr_func(interleave) + params.swizzle = pythonic_expr_func(swizzle) + params.l2_promotion = pythonic_expr_func(l2_promotion) + params.oob_fill = pythonic_expr_func(oob_fill) + except ValueError as e: + raise ValueError( + "Failed to unpack the final 6 TMA parameters " + "(smem_box_pixel, smem_box_channel, interleave, swizzle, l2Promotion, oobFill)" + ) from e + + results.append(params) + + return results diff --git a/tilelang/jit/adapter/wrapper.py b/tilelang/jit/adapter/wrapper.py index cdd0d5c7..7819890d 100644 --- a/tilelang/jit/adapter/wrapper.py +++ b/tilelang/jit/adapter/wrapper.py @@ -5,7 +5,8 @@ from typing import Any from tvm import IRModule from tvm.target import Target from .utils import (is_metal_target, match_declare_kernel, match_declare_kernel_cpu, is_cuda_target, - is_hip_target, is_cpu_target, get_annotated_mod, pythonic_expr) + is_hip_target, is_cpu_target, get_annotated_mod, pythonic_expr, + parse_function_call_args, parse_tma_descriptor_args) import re import logging import textwrap @@ -49,16 +50,6 @@ extern "C" int call({}) {{ }} """ -PREDEF_HOST_FUNC_PY = """ -import cuda.bindings.driver -import ctypes - -_function_names = {} - -def call({}): - {} -""" - L2_PERSISTENT_MAP_CREATE_HANDLE = """ \tcudaStreamAttrValue stream_attribute; \tsize_t init_persisting_l2_cache_size; @@ -136,65 +127,6 @@ TMA_IM2COL_DESC_INIT_FUNC = """ \t}} """ -TMA_DESC_INIT_FUNC_PY = """ -\t{0}_type = cuda.bindings.driver.CUtensorMapDataType({1}) 
-\t{0}_tensorRank = {2} -\t{0}_globalAddress = {3}.data_ptr() -\t{0}_globalDim = [{4}] -\t{0}_globalStride = [{5}][1:] -\t{0}_boxDim = [{6}] -\t{0}_elementStrides = [{7}] -\t{0}_interleave = cuda.bindings.driver.CUtensorMapInterleave({8}) -\t{0}_swizzle = cuda.bindings.driver.CUtensorMapSwizzle({9}) -\t{0}_l2Promotion = cuda.bindings.driver.CUtensorMapL2promotion({10}) -\t{0}_oobFill = cuda.bindings.driver.CUtensorMapFloatOOBfill({11}) - -\tres, {0} = cuda.bindings.driver.cuTensorMapEncodeTiled( -\t\t{0}_type, -\t\t{0}_tensorRank, -\t\t{0}_globalAddress, -\t\t{0}_globalDim, -\t\t{0}_globalStride, -\t\t{0}_boxDim, -\t\t{0}_elementStrides, -\t\t{0}_interleave, -\t\t{0}_swizzle, -\t\t{0}_l2Promotion, -\t\t{0}_oobFill, -\t) - -\tif res != cuda.bindings.driver.CUresult.CUDA_SUCCESS: -\t\traise RuntimeError(f"Failed to initialize the TMA descriptor {0}: {{res}}") -""" - -KERNEL_LAUNCH_FUNC_PY = """ -\tres = cuda.bindings.driver.cuKernelSetAttribute( -\t\tcuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, -\t\t{7}, -\t\tkernels["{0}"], -\t\tcuda.bindings.driver.CUdevice({10}) -\t)[0] -\tif res != cuda.bindings.driver.CUresult.CUDA_SUCCESS: -\t\traise RuntimeError(f"Failed to set max dynamic shared memory size to {7} for kernel {0}: {{res}}") - -\tconfig = cuda.bindings.driver.CUlaunchConfig() -\tconfig.gridDimX = {1} -\tconfig.gridDimY = {2} -\tconfig.gridDimZ = {3} -\tconfig.blockDimX = {4} -\tconfig.blockDimY = {5} -\tconfig.blockDimZ = {6} -\tconfig.sharedMemBytes = {7} -\tconfig.hStream = stream - -\targ_values = {8} -\targ_types = {9} - -\tres = cuda.bindings.driver.cuLaunchKernelEx(config, kernels["{0}"], (arg_values, arg_types), 0)[0] -\tif res != cuda.bindings.driver.CUresult.CUDA_SUCCESS: -\t\traise RuntimeError(f"Failed to launch kernel {0}: {{res}}") -""" - class BaseWrapper(ABC): @@ -297,41 +229,6 @@ class TLCUDASourceWrapper: # Format the function arguments for declaration def_args = ", ".join([f"{arg['type']} 
{arg['name']}" for arg in function_args]) - def func_call_args(s, - function_args, - function_params, - desc_name_map: dict[str, str] | None = None, - desc_name_var_map: dict[str, tvm.tir.Var] | None = None): - # Extract the function call arguments matching the function definition - def maybe_desc(name: str, matches: list[str], i: int): - match = matches[i] - if not (match == name + "_desc" or match.startswith(name + "_desc_")): - return False - desc_decls = [] - if desc_name_map is not None: - desc_name_map[match] = name - if i > 0: - desc_decls.append(matches[i - 1]) - if i < len(matches) - 1: - desc_decls.append(matches[i + 1]) - return any([decl == "CUtensorMap" for decl in desc_decls]) - - pattern = r"[,\s]*(?:\w+\s*\*+\s*__restrict__\s+)?(\w+)" - matches = re.findall(pattern, s) - call_args = [] - for i, match in enumerate(matches): - for arg in function_args: - if arg["name"] == match: - call_args.append(match) - elif maybe_desc(arg["name"], matches, i): - call_args.append(match) - assert len(call_args) <= len( - function_params - ), f"Function {function_name} has {len(function_params)} parameters, but {len(call_args)} arguments" - desc_name_var_map[match] = function_params[len(call_args) - 1] - - return call_args - has_l2_persistent_map = False for function_name, _ in function_informations.items(): if function_name in self.l2_persistent_map: @@ -365,8 +262,8 @@ class TLCUDASourceWrapper: kernel_launch_code += init_l2_persistent_map if self.use_cooperative_groups[function_name]: - args_list = func_call_args(declaration, function_args, function_params, - desc_name_map, desc_name_var_map) + args_list = parse_function_call_args(declaration, function_args, function_params, + desc_name_map, desc_name_var_map) assert len(function_params) == len( args_list ), f"Function {function_name} has {len(function_params)} parameters, but {len(args_list)} arguments" @@ -377,8 +274,8 @@ class TLCUDASourceWrapper: kernel_launch_code += 
"\tTILELANG_CHECK(cudaLaunchCooperativeKernel((void*){}, {}, {}, {}, {}, stream));\n".format( function_name, grid_str, block_str, function_name + "_args", smem_str) else: - args_list = func_call_args(declaration, function_args, function_params, - desc_name_map, desc_name_var_map) + args_list = parse_function_call_args(declaration, function_args, function_params, + desc_name_map, desc_name_var_map) assert len(function_params) == len( args_list ), f"Function {function_name} has {len(function_params)} parameters, but {len(args_list)} arguments" @@ -420,101 +317,26 @@ class TLCUDASourceWrapper: tma_descripter_init = "" if self.tma_descriptor_args is None: return tma_descripter_init - for handle_name, _ in desc_name_map.items(): - assert handle_name in desc_name_var_map, f"Handle name {handle_name} not found in desc_name_var_map" - desc_var = desc_name_var_map[handle_name] - - assert desc_var in self.tma_descriptor_args, f"TMA descriptor {desc_var} not found in {self.tma_descriptor_args}" - args = self.tma_descriptor_args[desc_var] - # Skip __tvm_tensormap_create_tiled - if len(args) < 3: - raise ValueError( - f"TMA descriptor args too short: {len(args)} elements, expected at least 3") - - tma_create_str, _, dtype, tensor_rank, globalAddress, *remaining_args = args - - is_img2col = (tma_create_str.value == "__tvm_tensormap_create_im2col") - dtype = self._pythonic_expr(dtype) - tensor_rank = int(self._pythonic_expr(tensor_rank)) - - # Validate tensor_rank - if not isinstance(tensor_rank, int) or tensor_rank <= 0: - raise ValueError(f"Invalid tensor_rank: {tensor_rank}. 
Must be a positive integer") - - if not is_img2col: - # Calculate required length for remaining_args - expected_args_len = 4 * tensor_rank + 4 # 4 groups of tensor_rank size + 4 parameters - if len(remaining_args) < expected_args_len: - raise ValueError(f"Insufficient remaining args: got {len(remaining_args)}, " - f"expected {expected_args_len} for tensor_rank {tensor_rank}") - - # Extract dimensions and strides using list slicing - global_dim = remaining_args[:tensor_rank] - global_stride = remaining_args[tensor_rank:2 * tensor_rank] - box_dim = remaining_args[2 * tensor_rank:3 * tensor_rank] - element_strides = remaining_args[3 * tensor_rank:4 * tensor_rank] - - global_dim = [self._pythonic_expr(i) for i in global_dim] - global_stride = [self._pythonic_expr(i) for i in global_stride] - box_dim = [self._pythonic_expr(i) for i in box_dim] - element_strides = [self._pythonic_expr(i) for i in element_strides] - - # Extract remaining parameters - try: - interleave, swizzle, l2Promotion, oobFill = remaining_args[4 * tensor_rank:4 * - tensor_rank + 4] - interleave = self._pythonic_expr(interleave) - swizzle = self._pythonic_expr(swizzle) - l2Promotion = self._pythonic_expr(l2Promotion) - oobFill = self._pythonic_expr(oobFill) - except ValueError as e: - raise ValueError( - "Failed to unpack the final 4 TMA parameters (interleave, swizzle, l2Promotion, oobFill)" - ) from e + # Parse TMA descriptor arguments using the common utility + parsed_params = parse_tma_descriptor_args(self.tma_descriptor_args, desc_name_map, + desc_name_var_map, self._pythonic_expr) + + # Generate C++ code from parsed parameters + for params in parsed_params: + if not params.is_img2col: tma_descripter_init += TMA_DESC_INIT_FUNC.format( - handle_name, dtype, tensor_rank, globalAddress, ",".join(global_dim), - ",".join(global_stride), ",".join(box_dim), ",".join(element_strides), - interleave, swizzle, l2Promotion, oobFill) + params.handle_name, params.dtype, params.tensor_rank, 
params.global_address, + ",".join(params.global_dim), ",".join(params.global_stride), + ",".join(params.box_dim), ",".join(params.element_strides), params.interleave, + params.swizzle, params.l2_promotion, params.oob_fill) else: - # Calculate required length for remaining_args - expected_args_len = 5 * tensor_rank + 2 - if len(remaining_args) < expected_args_len: - raise ValueError(f"Insufficient remaining args: got {len(remaining_args)}, " - f"expected {expected_args_len} for tensor_rank {tensor_rank}") - - # Extract dimensions and strides using list slicing - global_dim = remaining_args[:tensor_rank] - global_stride = remaining_args[tensor_rank:2 * tensor_rank] - element_strides = remaining_args[2 * tensor_rank:3 * tensor_rank] - lower_corner = remaining_args[3 * tensor_rank:4 * tensor_rank - 2] - upper_corner = remaining_args[4 * tensor_rank - 2:5 * tensor_rank - 4] - global_dim = [self._pythonic_expr(i) for i in global_dim] - global_stride = [self._pythonic_expr(i) for i in global_stride] - element_strides = [self._pythonic_expr(i) for i in element_strides] - lower_corner = [self._pythonic_expr(i) for i in lower_corner] - upper_corner = [self._pythonic_expr(i) for i in upper_corner] - - # Extract remaining parameters - try: - smem_box_pixel, smem_box_channel, interleave, swizzle, l2Promotion, oobFill = remaining_args[ - 5 * tensor_rank - 4:5 * tensor_rank + 2] - smem_box_pixel = self._pythonic_expr(smem_box_pixel) - smem_box_channel = self._pythonic_expr(smem_box_channel) - interleave = self._pythonic_expr(interleave) - swizzle = self._pythonic_expr(swizzle) - l2Promotion = self._pythonic_expr(l2Promotion) - oobFill = self._pythonic_expr(oobFill) - except ValueError as e: - raise ValueError( - "Failed to unpack the final 6 TMA parameters (smem_box_pixel, smem_box_channel, interleave, swizzle, l2Promotion, oobFill)" - ) from e - tma_descripter_init += TMA_IM2COL_DESC_INIT_FUNC.format( - handle_name, dtype, tensor_rank, globalAddress, ",".join(global_dim), - 
",".join(global_stride), ",".join(element_strides), ",".join(lower_corner), - ",".join(upper_corner), smem_box_channel, smem_box_pixel, interleave, swizzle, - l2Promotion, oobFill) + params.handle_name, params.dtype, params.tensor_rank, params.global_address, + ",".join(params.global_dim), ",".join(params.global_stride), + ",".join(params.element_strides), ",".join(params.lower_corner), + ",".join(params.upper_corner), params.smem_box_channel, params.smem_box_pixel, + params.interleave, params.swizzle, params.l2_promotion, params.oob_fill) return tma_descripter_init @@ -713,213 +535,6 @@ class TLCUDASourceWrapper: raise ValueError("Cannot find primary function in the module.") -class TLNVRTCSourceWrapper(TLCUDASourceWrapper): - """ - A wrapper class for the TileLang NVRTC backend. - """ - - _TYPE_MAP = { - "float32": "ctypes.c_float", - "float16": "ctypes.c_uint16", - "bfloat16": "ctypes.c_uint16", - "float8_e4m3": "ctypes.c_uint8", - "float8_e4m3fn": "ctypes.c_uint8", - "float8_e5m2": "ctypes.c_uint8", - "float64": "ctypes.c_double", - "int64": "ctypes.c_int64", - "int32": "ctypes.c_int32", - "uint32": "ctypes.c_uint32", - "bool": "ctypes.c_bool", - "int8": "ctypes.c_int8", - "uint8": "ctypes.c_uint8", - "int16": "ctypes.c_int16", - "uint16": "ctypes.c_uint16", - "uchar": "ctypes.c_uint8", - } - - def __init__(self, - scheduled_ir_module: IRModule, - source: str, - target: Target, - device_mod: IRModule | None = None, - host_mod: IRModule | None = None, - pass_configs: dict[str, Any] | None = None): - super().__init__(scheduled_ir_module, source, target, device_mod, host_mod, pass_configs) - - def create_dispatch_func(self, code, function_informations): - # Extract the set of dynamic symbolic names used in the primary function - dynamic_symbolic_set = self.get_dynamic_symbolic_set(self.prim_func) - - function_args = [{"name": "kernels", "type": "Dict[str, cuda.bindings.driver.CUkernel]"}] - # Collect function arguments based on primary function's parameters and 
buffer mappings - for param in self.prim_func.params: - if param in self.prim_func.buffer_map: - buffer = self.prim_func.buffer_map[param] - function_args.append({ - "name": buffer.data.name, - "type": "ctypes.c_void_p", - }) - elif isinstance(param, tvm.tir.Var): - function_args.append({"name": param.name, "type": self._lookup_type(param.dtype)}) - else: - raise ValueError( - f"Parameter {param} is not in the buffer map of the primary function.") - # Add dynamic symbols as integer arguments - for dyn_sym in dynamic_symbolic_set: - if dyn_sym not in [arg["name"] for arg in function_args]: - function_args.append({"name": dyn_sym, "type": "ctypes.c_int"}) - - function_args.append(self.get_stream_type()) - # Format the function arguments for declaration - def_args = ", ".join([f"{arg['name']}" for arg in function_args]) - - def func_call_args(s, function_args, desc_name_map: dict[str, str] | None = None): - # Extract the function call arguments matching the function definition - def maybe_desc(name: str, matches: list[str], i: int): - match = matches[i] - if not (match == name + "_desc" or match.startswith(name + "_desc_")): - return False - desc_decls = [] - if desc_name_map is not None: - desc_name_map[match] = name - if i > 0: - desc_decls.append(matches[i - 1]) - if i < len(matches) - 1: - desc_decls.append(matches[i + 1]) - return any([decl == "CUtensorMap" for decl in desc_decls]) - - pattern = r"[,\s]*(?:\w+\s*\*+\s*__restrict__\s+)?(\w+)" - matches = re.findall(pattern, s) - call_args = [] - for i, match in enumerate(matches): - for arg in function_args: - if arg["name"] == match: - call_args.append( - (f"{match}.data_ptr()" if arg["type"] == "ctypes.c_void_p" else match, - arg["type"])) - elif maybe_desc(arg["name"], matches, i): - call_args.append((match, "None")) - return call_args - - desc_name_map: dict[str, str] = {} - device_index = 0 - kernel_launch_code = """""" - for function_name, function_info in function_informations.items(): - block_info = 
function_info["block_info"] - grid_info = function_info["grid_info"] - dynamic_smem_buf = function_info["dynamic_smem_buf"] - - # Find the location of the global kernel function in the code - index = match_declare_kernel(code, function_name + "(") - - # Analyze the function declaration to prepare for argument extraction - declaration = code[index:].split(";")[0] - - # Identify the start of the function body to insert arguments - index = code.index("{", index) - call_args = func_call_args(declaration, function_args, desc_name_map) - for arg_name, arg_type in call_args: - if arg_type == "ctypes.c_void_p": - device_index = f"{arg_name.replace('.data_ptr()', '')}.device.index" - break - arg_names = ", ".join([arg[0] for arg in call_args]) - arg_types = ", ".join([arg[1] for arg in call_args]) - smem_str = 0 if dynamic_smem_buf is None else dynamic_smem_buf - kernel_launch_code += self.generate_tma_descriptor_args( - desc_name_map) + KERNEL_LAUNCH_FUNC_PY.format( - function_name, self._pythonic_expr(grid_info[0]), - self._pythonic_expr(grid_info[1]), self._pythonic_expr(grid_info[2]), - self._pythonic_expr(block_info[0]), self._pythonic_expr(block_info[1]), - self._pythonic_expr( - block_info[2]), smem_str, arg_names, arg_types, device_index) - - # Wrap the kernel dispatch logic in an external C function - host_func = PREDEF_HOST_FUNC_PY.format( - repr(list(function_informations.keys())), def_args, kernel_launch_code) - return host_func - - def generate_tma_descriptor_args(self, desc_name_map: dict[str, str]) -> str: - tma_descripter_init = "" - if self.tma_descriptor_args is None: - return tma_descripter_init - - for handle_name, name in desc_name_map.items(): - desc_name = name + "_desc" - assert desc_name in self.tma_descriptor_args, f"TMA descriptor {desc_name} not found in {self.tma_descriptor_args}" - args = self.tma_descriptor_args[desc_name] - # Skip __tvm_tensormap_create_tiled - if len(args) < 3: - raise ValueError( - f"TMA descriptor args too short: 
{len(args)} elements, expected at least 3") - _, dtype, tensor_rank, globalAddress, *remaining_args = args[1:] - - tensor_rank = int(tensor_rank) - # Validate tensor_rank - if not isinstance(tensor_rank, int) or tensor_rank <= 0: - raise ValueError(f"Invalid tensor_rank: {tensor_rank}. Must be a positive integer") - - # Calculate required length for remaining_args - # 4 groups of tensor_rank size + 4 parameters - expected_args_len = 4 * tensor_rank + 4 - if len(remaining_args) < expected_args_len: - raise ValueError(f"Insufficient remaining args: got {len(remaining_args)}, " - f"expected {expected_args_len} for tensor_rank {tensor_rank}") - - # Extract dimensions and strides using list slicing - global_dim = remaining_args[:tensor_rank] - global_stride = remaining_args[tensor_rank:2 * tensor_rank] - box_dim = remaining_args[2 * tensor_rank:3 * tensor_rank] - element_strides = remaining_args[3 * tensor_rank:4 * tensor_rank] - - global_dim = [str(i) for i in global_dim] - global_stride = [str(i) for i in global_stride] - box_dim = [str(i) for i in box_dim] - element_strides = [str(i) for i in element_strides] - - # Extract remaining parameters - try: - interleave, swizzle, l2Promotion, oobFill = remaining_args[4 * tensor_rank:4 * - tensor_rank + 4] - except ValueError as e: - raise ValueError( - "Failed to unpack the final 4 TMA parameters (interleave, swizzle, l2Promotion, oobFill)" - ) from e - - tma_descripter_init += TMA_DESC_INIT_FUNC_PY.format( - handle_name, dtype, tensor_rank, globalAddress, - ", ".join(map(lambda x: f"cuda.bindings.driver.cuuint64_t({x})", global_dim)), - ", ".join(map(lambda x: f"cuda.bindings.driver.cuuint64_t({x})", global_stride)), - ", ".join(map(lambda x: f"cuda.bindings.driver.cuuint32_t({x})", box_dim)), - ", ".join(map(lambda x: f"cuda.bindings.driver.cuuint32_t({x})", - element_strides)), interleave, swizzle, l2Promotion, oobFill) - return tma_descripter_init - - def update_lib_code(self, code: str): - # Update the library code 
with the given code string - self.lib_code = code - - # Organize function information for code generation - function_informations = {} - for function_name in self.function_names: - # Do not update function with dispatch host function - if (function_name not in self.block_info) or (function_name not in self.grid_info): - continue - - function_informations[function_name] = { - "function_name": function_name, - "block_info": self.block_info[function_name], - "grid_info": self.grid_info[function_name], - "dynamic_smem_buf": self.dynamic_smem_buf[function_name], - } - - # Create the host function wrapper for the CUDA kernel - self.host_func = self.create_dispatch_func(code, function_informations) - return self.lib_code - - def get_stream_type(self) -> dict[str, str]: - return {"name": "stream=0", "type": "int"} - - class TLHIPSourceWrapper(TLCUDASourceWrapper): """ A wrapper class for the TileLang HIP backend. @@ -1230,9 +845,10 @@ class TLPyWrapper(TLWrapper): def wrap(self, c_source: str): # assert self.scheduled_ir_module is not None, "Please assign optimized module first." 
if is_cuda_target(self.target): + from tilelang.jit.adapter.nvrtc import TLNVRTCSourceWrapper wrapper_class = TLNVRTCSourceWrapper else: - raise ValueError(f"Unsupported platform: {self.arch.platform}") + raise ValueError(f"Unsupported target for NVRTC backend: {self.target}") wrapper = wrapper_class( scheduled_ir_module=self.scheduled_ir_module, source=c_source, diff --git a/tilelang/jit/kernel.py b/tilelang/jit/kernel.py index bb47716c..6f5eb0b5 100644 --- a/tilelang/jit/kernel.py +++ b/tilelang/jit/kernel.py @@ -15,7 +15,7 @@ from tilelang import tvm from tilelang import env from tilelang.engine.param import CompiledArtifact, KernelParam from tilelang.jit.adapter import (BaseKernelAdapter, CtypesKernelAdapter, CythonKernelAdapter, - NVRTCKernelAdapter, TorchDLPackKernelAdapter, MetalKernelAdapter) + TorchDLPackKernelAdapter, MetalKernelAdapter) from tilelang.profiler import Profiler, TensorSupplyType from tilelang.utils.target import determine_target from tilelang.contrib import nvcc as tl_nvcc @@ -270,6 +270,7 @@ class JITKernel(Generic[_P, _T]): compile_flags=compile_flags, ) elif execution_backend == "nvrtc": + from tilelang.jit.adapter import NVRTCKernelAdapter adapter = NVRTCKernelAdapter( params=artifact.params, result_idx=out_idx, @@ -339,6 +340,7 @@ class JITKernel(Generic[_P, _T]): pass_configs=pass_configs, ) elif execution_backend == "nvrtc": + from tilelang.jit.adapter import NVRTCKernelAdapter adapter = NVRTCKernelAdapter.from_database( params=params, result_idx=result_idx, diff --git a/tilelang/language/annotations.py b/tilelang/language/annotations.py index 12d3af4d..3c469e78 100644 --- a/tilelang/language/annotations.py +++ b/tilelang/language/annotations.py @@ -5,6 +5,7 @@ from typing import Callable from tilelang.layout import Layout from tvm.script.parser.tir import attr, block_attr +from tvm.tir import FloatImm __all__ = [ "use_swizzle", @@ -49,5 +50,5 @@ def annotate_l2_hit_ratio(l2_hit_ratio_map: dict): _l2_hit_ratio_map = {} for buffer, 
hit_ratio in l2_hit_ratio_map.items(): assert buffer.scope() == "global", "persistent L2 can only be applied to global buffers" - _l2_hit_ratio_map[buffer.data] = float(hit_ratio) + _l2_hit_ratio_map[buffer.data] = FloatImm("float32", float(hit_ratio)) return block_attr({"l2_hit_ratio_map": _l2_hit_ratio_map}) -- GitLab From 729e66ca6de418085d896f6f662184f931da9bb2 Mon Sep 17 00:00:00 2001 From: Jiaxing Ding <61589029+Paran0idy@users.noreply.github.com> Date: Sat, 15 Nov 2025 22:12:20 +0800 Subject: [PATCH 004/139] [AMD] Update CK for ROCm7 (#1262) --- 3rdparty/composable_kernel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel index 1c45ca35..b38bb492 160000 --- a/3rdparty/composable_kernel +++ b/3rdparty/composable_kernel @@ -1 +1 @@ -Subproject commit 1c45ca35dd5c215e0c1db1f40f01556f467f52a8 +Subproject commit b38bb492a1a55b5abb0c345962143c0f9c482cfb -- GitLab From 2de566e798e2b6786255df395ce652d52f10af9e Mon Sep 17 00:00:00 2001 From: Kevinzz Date: Sun, 16 Nov 2025 15:56:11 +0800 Subject: [PATCH 005/139] [BugFix] Remove memory_order in atomic constexpr and fix NSA bwd (#1260) * fix nsa bwd and atomic * [Lint] * [BugFix] - New implementation for atomicMax and atomicMin using atomicCAS - PTX version atomicAdd for single 16-byte data - Modify the test cases * [Lint] --------- Co-authored-by: tzj-fxz --- .../deepseek_nsa/example_tilelang_nsa_bwd.py | 24 +- src/tl_templates/cuda/atomic.h | 213 +++++++++++++++--- .../test_tilelang_language_atomic_add.py | 60 ++--- 3 files changed, 229 insertions(+), 68 deletions(-) diff --git a/examples/deepseek_nsa/example_tilelang_nsa_bwd.py b/examples/deepseek_nsa/example_tilelang_nsa_bwd.py index 8387d227..1d1b5ea3 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_bwd.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_bwd.py @@ -106,8 +106,8 @@ def tilelang_kernel_fwd( T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) if is_causal: - for i, j in 
T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, + for k, j in T.Parallel(G, BS): + acc_s[k, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) @@ -124,18 +124,18 @@ def tilelang_kernel_fwd( T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=True) - for i in T.Parallel(G): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + for k in T.Parallel(G): + scores_scale[k] = T.exp2(scores_max_prev[k] * scale - scores_max[k] * scale) + for k, j in T.Parallel(G, BS): + acc_s[k, j] = T.exp2(acc_s[k, j] * scale - scores_max[k] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(G): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + for k in T.Parallel(G): + logsum[k] = logsum[k] * scores_scale[k] + scores_sum[k] T.copy(acc_s, acc_s_cast) # Rescale - for i, j in T.Parallel(G, BV): - acc_o[i, j] *= scores_scale[i] + for k, j in T.Parallel(G, BV): + acc_o[k, j] *= scores_scale[k] # V * softmax(Q * K) T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) @@ -465,8 +465,8 @@ def tilelang_kernel_bwd_dqkv( T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) # [G] T.copy(Delta_slc[i_b, i, i_h * G:(i_h + 1) * G], delta) - for i, j in T.Parallel(BS, G): - dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale + for _i, _j in T.Parallel(BS, G): + dsT_cast[_i, _j] = qkT[_i, _j] * (dsT[_i, _j] - delta[_j]) * sm_scale # [BS, G] @ [G, BK] -> [BS, BK] T.gemm(dsT_cast, Q_shared, dk, policy=T.GemmWarpPolicy.FullRow) diff --git a/src/tl_templates/cuda/atomic.h b/src/tl_templates/cuda/atomic.h index 82eeccfd..a573886b 100644 --- a/src/tl_templates/cuda/atomic.h +++ b/src/tl_templates/cuda/atomic.h @@ -46,10 +46,22 @@ TL_DEVICE void AtomicMax(T1 &ref, T2 val, int memory_order = 
int(cuda::memory_order_relaxed)) { using NT1 = typename normalize_atomic_type::type; T1 *address = &ref; - if constexpr ((std::is_same_v || - std::is_same_v) && - memory_order == int(cuda::memory_order_relaxed)) { - atomicMax(reinterpret_cast(address), static_cast(val)); + if constexpr (std::is_same_v || + std::is_same_v) { + // There is no implementation of atomicMax for half and bf16 in cuda. + // We simulate this process by atomicCAS loop. + unsigned short *address_as_ushort = + reinterpret_cast(address); + unsigned short val_as_ushort = *reinterpret_cast(&val); + unsigned short old_val_ushort = *address_as_ushort; + while (val > *reinterpret_cast(&old_val_ushort)) { + unsigned short assumed_val_ushort = old_val_ushort; + old_val_ushort = + atomicCAS(address_as_ushort, assumed_val_ushort, val_as_ushort); + if (assumed_val_ushort == old_val_ushort) { + break; + } + } } else { cuda::atomic_ref aref(*address); aref.fetch_max(cuda_cast(val), cuda::memory_order(memory_order)); @@ -61,11 +73,21 @@ TL_DEVICE T1 AtomicMaxRet(T1 &ref, T2 val, int memory_order = int(cuda::memory_order_relaxed)) { using NT1 = typename normalize_atomic_type::type; T1 *address = &ref; - if constexpr ((std::is_same_v || - std::is_same_v) && - memory_order == int(cuda::memory_order_relaxed)) { - return static_cast( - atomicMax(reinterpret_cast(address), static_cast(val))); + if constexpr (std::is_same_v || + std::is_same_v) { + unsigned short *address_as_ushort = + reinterpret_cast(address); + unsigned short val_as_ushort = *reinterpret_cast(&val); + unsigned short old_val_ushort = *address_as_ushort; + while (val > *reinterpret_cast(&old_val_ushort)) { + unsigned short assumed_val_ushort = old_val_ushort; + old_val_ushort = + atomicCAS(address_as_ushort, assumed_val_ushort, val_as_ushort); + if (assumed_val_ushort == old_val_ushort) { + break; + } + } + return static_cast(*reinterpret_cast(&old_val_ushort)); } else { cuda::atomic_ref aref(*address); return static_cast( @@ -78,10 +100,22 @@ 
TL_DEVICE void AtomicMin(T1 &ref, T2 val, int memory_order = int(cuda::memory_order_relaxed)) { using NT1 = typename normalize_atomic_type::type; T1 *address = &ref; - if constexpr ((std::is_same_v || - std::is_same_v) && - memory_order == int(cuda::memory_order_relaxed)) { - atomicMin(reinterpret_cast(address), static_cast(val)); + if constexpr (std::is_same_v || + std::is_same_v) { + // There is no implementation of atomicMin for half and bf16 in cuda. + // We simulate this process by atomicCAS loop. + unsigned short *address_as_ushort = + reinterpret_cast(address); + unsigned short val_as_ushort = *reinterpret_cast(&val); + unsigned short old_val_ushort = *address_as_ushort; + while (val < *reinterpret_cast(&old_val_ushort)) { + unsigned short assumed_val_ushort = old_val_ushort; + old_val_ushort = + atomicCAS(address_as_ushort, assumed_val_ushort, val_as_ushort); + if (assumed_val_ushort == old_val_ushort) { + break; + } + } } else { cuda::atomic_ref aref(*address); aref.fetch_min(cuda_cast(val), cuda::memory_order(memory_order)); @@ -93,11 +127,21 @@ TL_DEVICE T1 AtomicMinRet(T1 &ref, T2 val, int memory_order = int(cuda::memory_order_relaxed)) { using NT1 = typename normalize_atomic_type::type; T1 *address = &ref; - if constexpr ((std::is_same_v || - std::is_same_v) && - memory_order == int(cuda::memory_order_relaxed)) { - return static_cast( - atomicMin(reinterpret_cast(address), static_cast(val))); + if constexpr (std::is_same_v || + std::is_same_v) { + unsigned short *address_as_ushort = + reinterpret_cast(address); + unsigned short val_as_ushort = *reinterpret_cast(&val); + unsigned short old_val_ushort = *address_as_ushort; + while (val < *reinterpret_cast(&old_val_ushort)) { + unsigned short assumed_val_ushort = old_val_ushort; + old_val_ushort = + atomicCAS(address_as_ushort, assumed_val_ushort, val_as_ushort); + if (assumed_val_ushort == old_val_ushort) { + break; + } + } + return static_cast(*reinterpret_cast(&old_val_ushort)); } else { 
cuda::atomic_ref aref(*address); return static_cast( @@ -110,10 +154,67 @@ TL_DEVICE void AtomicAdd(T1 &ref, T2 val, int memory_order = int(cuda::memory_order_relaxed)) { using NT1 = typename normalize_atomic_type::type; T1 *address = &ref; - if constexpr ((std::is_same_v || - std::is_same_v) && - memory_order == int(cuda::memory_order_relaxed)) { - atomicAdd(reinterpret_cast(address), static_cast(val)); + if constexpr (std::is_same_v || + std::is_same_v) { + if (memory_order == int(cuda::memory_order_relaxed)) { + atomicAdd(reinterpret_cast(address), static_cast(val)); + } else { + // Since atomic ref do not support memory order, we need to inline ptx + // code here for each situation + if constexpr (std::is_same_v) { + // fp16 + __half ret_val; + unsigned short ret_val_cast = + *reinterpret_cast(&ret_val); + unsigned long long ref_address = + reinterpret_cast(address); + unsigned short val_cast = *reinterpret_cast(&val); + if (memory_order == int(cuda::memory_order_release) || + memory_order == int(cuda::memory_order_consume)) { + asm volatile("atom.release.gpu.global.add.noftz.f16 %0, [%1], %2;" + : "=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } else if (memory_order == int(cuda::memory_order_acquire)) { + asm volatile("atom.acquire.gpu.global.add.noftz.f16 %0, [%1], %2;" + : "=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } else if (memory_order == int(cuda::memory_order_acq_rel) || + memory_order == int(cuda::memory_order_seq_cst)) { + asm volatile("atom.acq_rel.gpu.global.add.noftz.f16 %0, [%1], %2;" + : "=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } + } else if constexpr (std::is_same_v) { + // bf16 + __nv_bfloat16 ret_val; + unsigned short ret_val_cast = + *reinterpret_cast(&ret_val); + unsigned long long ref_address = + reinterpret_cast(address); + unsigned short val_cast = *reinterpret_cast(&val); + if (memory_order == int(cuda::memory_order_release) || + memory_order == 
int(cuda::memory_order_consume)) { + asm volatile("atom.release.gpu.global.add.noftz.bf16 %0, [%1], %2;" + : "=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } else if (memory_order == int(cuda::memory_order_acquire)) { + asm volatile("atom.acquire.gpu.global.add.noftz.bf16 %0, [%1], %2;" + : "=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } else if (memory_order == int(cuda::memory_order_acq_rel) || + memory_order == int(cuda::memory_order_seq_cst)) { + asm volatile("atom.acq_rel.gpu.global.add.noftz.bf16 %0, [%1], %2;" + : "=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } + } + } } else { cuda::atomic_ref aref(*address); aref.fetch_add(cuda_cast(val), cuda::memory_order(memory_order)); @@ -125,11 +226,69 @@ TL_DEVICE T1 AtomicAddRet(T1 &ref, T2 val, int memory_order = int(cuda::memory_order_relaxed)) { using NT1 = typename normalize_atomic_type::type; T1 *address = &ref; - if constexpr ((std::is_same_v || - std::is_same_v) && - memory_order == int(cuda::memory_order_relaxed)) { - return static_cast( - atomicAdd(reinterpret_cast(address), static_cast(val))); + if constexpr (std::is_same_v || + std::is_same_v) { + if (memory_order == int(cuda::memory_order_relaxed)) { + return static_cast( + atomicAdd(reinterpret_cast(address), static_cast(val))); + } else { + if constexpr (std::is_same_v) { + // fp16 + __half ret_val; + unsigned short ret_val_cast = + *reinterpret_cast(&ret_val); + unsigned long long ref_address = + reinterpret_cast(address); + unsigned short val_cast = *reinterpret_cast(&val); + if (memory_order == int(cuda::memory_order_release) || + memory_order == int(cuda::memory_order_consume)) { + asm volatile("atom.release.gpu.global.add.noftz.f16 %0, [%1], %2;" + : "=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } else if (memory_order == int(cuda::memory_order_acquire)) { + asm volatile("atom.acquire.gpu.global.add.noftz.f16 %0, [%1], %2;" + : 
"=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } else if (memory_order == int(cuda::memory_order_acq_rel) || + memory_order == int(cuda::memory_order_seq_cst)) { + asm volatile("atom.acq_rel.gpu.global.add.noftz.f16 %0, [%1], %2;" + : "=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } + return static_cast(*reinterpret_cast<__half *>(&ret_val_cast)); + } else if constexpr (std::is_same_v) { + // bf16 + __nv_bfloat16 ret_val; + unsigned short ret_val_cast = + *reinterpret_cast(&ret_val); + unsigned long long ref_address = + reinterpret_cast(address); + unsigned short val_cast = *reinterpret_cast(&val); + if (memory_order == int(cuda::memory_order_release) || + memory_order == int(cuda::memory_order_consume)) { + asm volatile("atom.release.gpu.global.add.noftz.bf16 %0, [%1], %2;" + : "=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } else if (memory_order == int(cuda::memory_order_acquire)) { + asm volatile("atom.acquire.gpu.global.add.noftz.bf16 %0, [%1], %2;" + : "=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } else if (memory_order == int(cuda::memory_order_acq_rel) || + memory_order == int(cuda::memory_order_seq_cst)) { + asm volatile("atom.acq_rel.gpu.global.add.noftz.bf16 %0, [%1], %2;" + : "=h"(ret_val_cast) + : "l"(ref_address), "h"(val_cast) + : "memory"); + } + return static_cast( + *reinterpret_cast<__nv_bfloat16 *>(&ret_val_cast)); + } + } } else { cuda::atomic_ref aref(*address); return static_cast( diff --git a/testing/python/language/test_tilelang_language_atomic_add.py b/testing/python/language/test_tilelang_language_atomic_add.py index 42c33e54..132e002a 100644 --- a/testing/python/language/test_tilelang_language_atomic_add.py +++ b/testing/python/language/test_tilelang_language_atomic_add.py @@ -236,7 +236,31 @@ def run_atomic_addx2(M, N, block_M, block_N): torch.testing.assert_close(B, ref_B, atol=1e-3, rtol=1e-3) -@tilelang.jit +def test_atomic_add(): + 
run_atomic_add(8, 128, 128, 32, 32) + + +def test_atomic_max(): + run_atomic_max(4, 64, 64, 16, 16) + + +def test_atomic_min(): + run_atomic_min(4, 64, 64, 16, 16) + + +def test_atomic_load_store(): + run_atomic_load_store(64, 64, 16, 16) + + +def test_atomic_memory_order(): + run_atomic_memory_order(4, 64, 64, 16, 16) + + +def test_atomic_addx2(): + run_atomic_addx2(32, 64, 8, 16) + + +@tilelang.jit(debug_root_path="./testing/python/language") def atomic_different_memory_orders_program(M, N, block_M, block_N, dtype="float"): @T.prim_func @@ -248,9 +272,9 @@ def atomic_different_memory_orders_program(M, N, block_M, block_N, dtype="float" idx_j = by * block_N + j if idx_i < M and idx_j < N: val = A[idx_i, idx_j] - T.atomic_add(B[idx_i, idx_j], val, memory_order="relaxed") - T.atomic_max(C[idx_i, idx_j], val, memory_order="acquire") - T.atomic_min(D[idx_i, idx_j], val, memory_order="release") + T.atomic_add(B[idx_i, idx_j], val, memory_order="release") + T.atomic_max(C[idx_i, idx_j], val, memory_order="relaxed") + T.atomic_min(D[idx_i, idx_j], val, memory_order="relaxed") return atomic_different_orders @@ -271,30 +295,6 @@ def run_atomic_different_memory_orders(M, N, block_M, block_N, dtype="float32"): torch.testing.assert_close(D, torch.minimum(torch.full_like(A, float('inf')), A)) -def test_atomic_add(): - run_atomic_add(8, 128, 128, 32, 32) - - -def test_atomic_max(): - run_atomic_max(4, 64, 64, 16, 16) - - -def test_atomic_min(): - run_atomic_min(4, 64, 64, 16, 16) - - -def test_atomic_load_store(): - run_atomic_load_store(64, 64, 16, 16) - - -def test_atomic_memory_order(): - run_atomic_memory_order(4, 64, 64, 16, 16) - - -def test_atomic_addx2(): - run_atomic_addx2(32, 64, 8, 16) - - @tilelang.jit def atomic_addx4_program(M, N, block_M, block_N): @@ -361,7 +361,9 @@ def run_atomic_return_prev(M, N, block_M, block_N, dtype="float32"): def test_atomic_different_memory_orders(): - run_atomic_different_memory_orders(32, 32, 8, 8) + 
run_atomic_different_memory_orders(32, 32, 8, 8, dtype="float") + run_atomic_different_memory_orders(32, 32, 8, 8, dtype="float16") + run_atomic_different_memory_orders(32, 32, 8, 8, dtype="bfloat16") def test_atomic_addx4(): -- GitLab From 716dbef52f550dd4d0864c340eb2362904b0ea33 Mon Sep 17 00:00:00 2001 From: Zhengju Tang <97930865+tzj-fxz@users.noreply.github.com> Date: Mon, 17 Nov 2025 01:22:02 +0800 Subject: [PATCH 006/139] [Example] Add GQA decoding kernel with varlen page table (#1265) * [Example] Add page table for gqa decode * [Example] Page table for varlen decoding * [Lint] * [Refactor] Remove redundant code * [Lint] * [Lint] * [Lint] --- .../example_gqa_decode_varlen_logits_paged.py | 711 ++++++++++++++++++ 1 file changed, 711 insertions(+) create mode 100644 examples/flash_decoding/example_gqa_decode_varlen_logits_paged.py diff --git a/examples/flash_decoding/example_gqa_decode_varlen_logits_paged.py b/examples/flash_decoding/example_gqa_decode_varlen_logits_paged.py new file mode 100644 index 00000000..e565cbeb --- /dev/null +++ b/examples/flash_decoding/example_gqa_decode_varlen_logits_paged.py @@ -0,0 +1,711 @@ +import torch +import math +import argparse +import tilelang +import tilelang.language as T +from example_gqa_decode_varlen_logits import flash_attn_with_attn_pool_decode, repeat_kv, do_bench + +torch.manual_seed(0) + + +def get_configs(): + import itertools + block_N = [64, 128] + block_H = [64] + num_split = [1] + num_stages = [1, 2, 3] + threads = [128] + _configs = list(itertools.product(block_N, block_H, num_split, num_stages, threads)) + + configs = [{ + 'block_N': c[0], + 'block_H': c[1], + 'num_split': c[2], + 'num_stages': c[3], + 'threads': c[4] + } for c in _configs] + return configs + + +# @autotune(configs=get_configs(), warmup=10, rep=10) +@tilelang.jit(out_idx=[-2, -1], debug_root_path="./examples/flash_decoding") +def flashattn(batch, + heads, + k_heads, + max_seqlen_kv, + total_seqlen_k, + dim, + has_sink, + page_block_size, 
+ block_N=128, + block_H=64, + num_split=1, + num_stages=1, + threads=128): + scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + shape_q = [batch, heads, dim] + shape_k = [total_seqlen_k, k_heads, dim] + shape_v = [total_seqlen_k, k_heads, dim] + shape_o = [batch, heads, dim] + shape_s = [batch, heads, math.ceil(max_seqlen_kv / block_N)] + dtype = "float16" + accum_dtype = "float" + kv_group_num = heads // k_heads + assert page_block_size >= block_N and page_block_size % block_N == 0, "page_block_size must be larger than block_N and a multiple of block_N" + + valid_block_H = min(block_H, kv_group_num) + # TODO: check if max_seqlen_kv is correct for varlen case + + @T.macro + def flash_attn( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + cu_seqlens_k: T.Tensor([batch + 1], "int32"), + s_aux: T.Tensor([heads], "float32"), + BLOCK_TABLE: T.Tensor([batch, math.ceil(max_seqlen_kv / block_N)], "int32"), + Output: T.Tensor([batch, heads, dim], dtype), + S: T.Tensor(shape_s, dtype), + ): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_H, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([valid_block_H, dim], dtype) + acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) + acc_o = T.alloc_fragment([block_H, dim], accum_dtype) + scores_max = T.alloc_fragment([block_H], accum_dtype) + scores_max_prev = T.alloc_fragment([block_H], accum_dtype) + scores_scale = T.alloc_fragment([block_H], accum_dtype) + scores_sum = T.alloc_fragment([block_H], accum_dtype) + logsum = T.alloc_fragment([block_H], accum_dtype) + S_shared = T.alloc_shared([block_H, math.ceil(max_seqlen_kv / block_N)], dtype) + s_aux_shared = T.alloc_shared([block_H], "float32") + + bid = bx + hid = by + cur_kv_head = hid // (kv_group_num 
// valid_block_H) + + cur_start_k = cu_seqlens_k[bid] + cur_end_k = cu_seqlens_k[bid + 1] + cur_seqlen_k = cur_end_k - cur_start_k + + T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + + # loop_range = T.ceildiv((seqlen_kv // num_split), block_N) + loop_range = T.ceildiv((cur_seqlen_k // num_split), block_N) + for k in T.Pipelined(loop_range, num_stages=num_stages): + k_start = BLOCK_TABLE[bid, (k * block_N) // page_block_size] * page_block_size + ( + k * block_N) % page_block_size + T.copy(K[cur_start_k + k_start:cur_start_k + k_start + block_N, cur_kv_head, :], + K_shared) + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j < cur_seqlen_k, acc_s[i, j], + -T.infinity(accum_dtype)) + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + # scores_max_prev is m_i + # scores_max is row_max->m_ij in triton + T.copy(scores_max, S_shared[:, k]) + # scores_scale is alpha in triton + for i in T.Parallel(block_H): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + # scores_sum is l_ij in triton + # logsum is l_i in triton + for i in T.Parallel(block_H): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + for i, j in T.Parallel(block_H, dim): + acc_o[i, j] *= scores_scale[i] + v_start = BLOCK_TABLE[bid, (k * block_N) // page_block_size] * page_block_size + ( + k * block_N) % page_block_size + T.copy(V[cur_start_k + v_start:cur_start_k + v_start + block_N, cur_kv_head, :], + V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, 
policy=T.GemmWarpPolicy.FullRow) + + if has_sink: + T.copy(s_aux[hid * valid_block_H:hid * valid_block_H + block_H], s_aux_shared) + for i in T.Parallel(block_H): + logsum[i] += s_aux_shared[i] + for i, j in T.Parallel(block_H, dim): + acc_o[i, j] /= logsum[i] + for h, k in T.Parallel(block_H, math.ceil(max_seqlen_kv / block_N)): + S_shared[h, k] = T.exp2((S_shared[h, k] - scores_max[h]) * scale) / logsum[h] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(acc_o[:valid_block_H, :], O_shared) + T.copy(O_shared, Output[bid, hid * valid_block_H:(hid + 1) * valid_block_H, :]) + T.copy(S_shared[:valid_block_H, :], S[bid, + hid * valid_block_H:(hid + 1) * valid_block_H, :]) + + @T.prim_func + def flashattn_gqa_decode_no_split( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + cu_seqlens_k: T.Tensor([batch + 1], "int32"), + s_aux: T.Tensor([heads], "float32"), + BLOCK_TABLE: T.Tensor([batch, math.ceil(max_seqlen_kv / page_block_size)], "int32"), + Output: T.Tensor(shape_o, dtype), + S: T.Tensor(shape_s, dtype), + ): + flash_attn(Q, K, V, cu_seqlens_k, s_aux, BLOCK_TABLE, Output, S) + + # TODO: split version + return flashattn_gqa_decode_no_split + + +def flash_attn_with_attn_pool_decode_tilelang( + Q: torch.Tensor, ## [tq = b, q_h, q_dim] + K: torch.Tensor, ## [tk, k_h, k_dim] + V: torch.Tensor, + cu_seqlens_k: torch.Tensor, + max_seqlen_k: int, + real_max_k_seqlen: int, + num_split: int, + softmax_scale: float, + s_aux: torch.Tensor = None, + block_size: int = 64, + use_per_kv_head_sparse_index: bool = False, + tl_kernel=None, + block_table: torch.Tensor = None, +): + num_tokens, q_h, head_size = Q.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = K.size(1) + + assert Q.dim() == K.dim() == 3 + assert Q.size(2) == K.size(2) + assert cu_seqlens_k.dim() == 1 + assert head_size in {64, 128, 256} + assert Q.is_contiguous() + assert K.is_contiguous() + assert V.is_contiguous() + + 
gqa_group_size = q_h // k_h + + O_tl = torch.zeros_like(Q) + S_tl = torch.zeros((batch, q_h, math.ceil(real_max_k_seqlen / block_size)), + dtype=Q.dtype, + device=Q.device) + O_tl, S_tl = tl_kernel(Q, K, V, cu_seqlens_k, s_aux, block_table) + + if use_per_kv_head_sparse_index: + S_tl = torch.max_pool2d(S_tl, kernel_size=(gqa_group_size, 1), stride=(gqa_group_size, 1)) + else: + S_tl = torch.max_pool2d(S_tl, kernel_size=(q_h, 1), stride=(q_h, 1)) + + return O_tl, S_tl + + +def test_equal_seqlen_decode_main(args): + """Test decode kernel with equal sequence lengths""" + print("Testing decode kernel with equal sequence lengths") + + batch_size = args.batch_size + q_heads = args.q_heads + kv_heads = args.kv_heads + k_seqlen = args.k_seqlen + real_max_k_seqlen = args.k_seqlen + head_size = args.head_size + block_size = args.block_size + page_block_size = args.page_block_size + dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 + + # For decode, query is just 1 token per batch + q = torch.randn(batch_size, q_heads, head_size, device='cuda', dtype=dtype) + k = torch.randn(batch_size, kv_heads, k_seqlen, head_size, device='cuda', dtype=dtype) + v = torch.randn(batch_size, kv_heads, k_seqlen, head_size, device='cuda', dtype=dtype) + softmax_scale = 1.0 / math.sqrt(head_size) + + # Generate sink values if needed + sink = None + if args.test_sink: + sink = torch.randn(q_heads, device='cuda', dtype=torch.float32) * 0.1 # Small sink values + print(f"Using sink attention with sink values: {sink}") + + # Convert to varlen format for K, V + k_varlen = k.transpose(1, 2).reshape(batch_size * k_seqlen, kv_heads, head_size).contiguous() + v_varlen = v.transpose(1, 2).reshape(batch_size * k_seqlen, kv_heads, head_size).contiguous() + + # Generate cumulative sequence lengths + cu_seqlens_k = torch.arange( + 0, (batch_size + 1) * k_seqlen, k_seqlen, device='cuda', dtype=torch.int32) + max_seqlen_k = k_seqlen + + print(f"q shape: {q.shape}") + print(f"k_varlen shape: 
{k_varlen.shape}") + print(f"v_varlen shape: {v_varlen.shape}") + + num_tokens, q_h, head_size = q.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = k_varlen.size(1) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, + args.test_sink, page_block_size) + + block_table = torch.zeros( + batch, math.ceil(real_max_k_seqlen / page_block_size), device='cuda', dtype=torch.int32) + block_cnt = 0 + for i in range(batch): + cur_seqlen = cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item() + for j in range(math.ceil(cur_seqlen / page_block_size)): + block_table[i, j] = block_cnt + block_cnt += 1 + block_cnt = 0 + + # Test our decode kernel + O_triton, S_triton = flash_attn_with_attn_pool_decode( + q, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size) + O_tilelang, S_tilelang = flash_attn_with_attn_pool_decode_tilelang( + q, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size, + tl_kernel=tl_kernel, + block_table=block_table, + ) + for i in range(batch_size): + S_tilelang[i, :, + math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / + block_size):] = 0 + + # Compute torch reference + q_expanded = q.unsqueeze(2) # [b, q_heads, 1, head_size] + k_repeat = repeat_kv(k, q_heads // kv_heads) # [b, q_heads, k_seqlen, head_size] + v_repeat = repeat_kv(v, q_heads // kv_heads) # [b, q_heads, k_seqlen, head_size] + + if sink is None: + # Standard scaled dot-product attention + logits = torch.matmul(q_expanded, k_repeat.transpose( + -2, -1)) * softmax_scale # [batch, q_heads, 1, seqlen_k] + attn_weights = torch.softmax(logits, dim=-1) + O_torch = torch.matmul(attn_weights, v_repeat).squeeze(2) # [batch, q_heads, head_size] + else: + # s_aux attention + logits = torch.matmul(q_expanded, k_repeat.transpose( + -2, -1)) * softmax_scale # [batch, 
q_heads, 1, seqlen_k] + + sink_expanded = sink.view(1, q_heads, 1, 1) # [1, q_heads, 1, 1] + logits_max = torch.max(logits, dim=-1, keepdim=True).values + logits_or_sinks_max = torch.maximum(logits_max, sink_expanded) + sinks = torch.exp(sink_expanded - logits_or_sinks_max) + unnormalized_scores = torch.exp(logits - logits_or_sinks_max) + normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks + attn_weights = unnormalized_scores / normalizer + O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), + v_repeat).squeeze(2) # [batch, q_heads, head_size] + + # Compute attention score pooling + attn_score_pooled = torch.max_pool2d( + attn_weights.squeeze(2), # [b, q_heads, k_seqlen] + kernel_size=(q_heads, block_size), + stride=(q_heads, block_size), + ceil_mode=True).to(torch.float16) + + print("S_tilelang", S_tilelang) + print("attn_score_pooled", attn_score_pooled) + + max_diff_o = torch.max(torch.abs(O_triton - O_torch)) + max_diff_s = torch.max(torch.abs(S_triton - attn_score_pooled)) + max_diff_o_tilelang = torch.max(torch.abs(O_tilelang - O_torch)) + max_diff_s_tilelang = torch.max(torch.abs(S_tilelang - attn_score_pooled)) + + print(f"Max difference in O: {max_diff_o.item()}") + print(f"Max difference in S: {max_diff_s.item()}") + print(f"Max difference in O_tilelang: {max_diff_o_tilelang.item()}") + print(f"Max difference in S_tilelang: {max_diff_s_tilelang.item()}") + assert torch.allclose( + O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" + assert torch.allclose( + S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" + assert torch.allclose( + O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tilelang.item()}" + assert torch.allclose( + S_tilelang, attn_score_pooled, atol=1e-2, + rtol=1e-2), f"Score mismatch: {max_diff_s_tilelang.item()}" + print("✅ All tests passed!") + + +def test_varlen_decode_main(args): + """Test decode kernel with variable 
sequence lengths""" + batch_size = args.batch_size + q_heads = args.q_heads + kv_heads = args.kv_heads + max_k_seqlen = args.k_seqlen # Use as max sequence length + real_max_k_seqlen = args.k_seqlen + head_size = args.head_size + block_size = args.block_size + page_block_size = args.page_block_size + dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 + + print(f"Testing decode kernel with variable sequence lengths (max_k_seqlen={max_k_seqlen})") + + # Generate sink values if needed + sink = None + if args.test_sink: + sink = torch.randn(q_heads, device='cuda', dtype=torch.float32) * 0.1 # Small sink values + print(f"Using sink attention with sink values: {sink}") + + # Generate variable length k sequences + k_seqlens = torch.randint(max_k_seqlen // 4, max_k_seqlen + 1, size=(batch_size,)) + print(f"k_seqlens: {k_seqlens}") + + # Generate cumulative sequence lengths for k + cu_seqlens_k = torch.zeros(batch_size + 1, device='cuda', dtype=torch.int32) + total_k_tokens = 0 + for i in range(batch_size): + cu_seqlens_k[i] = total_k_tokens + total_k_tokens += k_seqlens[i] + cu_seqlens_k[batch_size] = total_k_tokens + + print(f"cu_seqlens_k: {cu_seqlens_k}") + + # Generate tensors - Q is [batch_size, q_heads, head_size] for decode + q_decode = torch.randn(batch_size, q_heads, head_size, device='cuda', dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) + + softmax_scale = 1.0 / math.sqrt(head_size) + max_seqlen_k = int(k_seqlens.max()) + + print(f"Actual max_seqlen_k: {max_seqlen_k}") + print(f"q_decode shape: {q_decode.shape}") + print(f"k_varlen shape: {k_varlen.shape}") + print(f"v_varlen shape: {v_varlen.shape}") + + num_tokens, q_h, head_size = q_decode.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = k_varlen.size(1) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, + 
args.test_sink, page_block_size) + + block_table = torch.zeros( + batch, math.ceil(real_max_k_seqlen / page_block_size), device='cuda', dtype=torch.int32) + block_cnt = 0 + for i in range(batch): + cur_seqlen = cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item() + for j in range(math.ceil(cur_seqlen / page_block_size)): + block_table[i, j] = block_cnt + block_cnt += 1 + block_cnt = 0 + + # Test our decode kernel + O_triton, S_triton = flash_attn_with_attn_pool_decode( + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size) + O_tilelang, S_tilelang = flash_attn_with_attn_pool_decode_tilelang( + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size, + tl_kernel=tl_kernel, + block_table=block_table, + ) + for i in range(batch_size): + S_tilelang[i, :, + math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / + block_size):] = 0 + + # Create torch reference - pad tensors for comparison + k_padded_list = [] + v_padded_list = [] + + for i in range(batch_size): + actual_k_len = k_seqlens[i] + + # Extract and pad k, v for this batch + k_start = cu_seqlens_k[i] + k_end = cu_seqlens_k[i + 1] + + # Pad to max_seqlen_k + k_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device='cuda', dtype=dtype) + v_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device='cuda', dtype=dtype) + + k_padded[:actual_k_len] = k_varlen[k_start:k_end] + v_padded[:actual_k_len] = v_varlen[k_start:k_end] + + k_padded_list.append(k_padded) + v_padded_list.append(v_padded) + + # Stack to create batched tensors [b, max_seqlen, kv_heads, head_size] + k_padded_batched = torch.stack( + k_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] + v_padded_batched = torch.stack( + v_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] 
+ + # Expand q to match kv heads: [b, q_heads, 1, head_size] + q_expanded = q_decode.unsqueeze(2) # [b, q_heads, 1, head_size] + + print(f"q_expanded shape: {q_expanded.shape}") + print(f"k_padded_batched shape: {k_padded_batched.shape}") + print(f"v_padded_batched shape: {v_padded_batched.shape}") + + # Compute torch reference + k_repeat = repeat_kv(k_padded_batched, + q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] + v_repeat = repeat_kv(v_padded_batched, + q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] + + if sink is None: + # Standard attention computation: [b, q_heads, 1, head_size] @ [b, q_heads, head_size, max_seqlen] + attn_score = torch.matmul(q_expanded, k_repeat.transpose( + -2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] + + # Apply sequence length masking + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_score[i, :, :, actual_k_len:] = float('-inf') + + attn_weights = attn_score.softmax(dim=-1) # [b, q_heads, 1, max_seqlen] + + # Mask out invalid positions + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_weights[i, :, :, actual_k_len:] = 0.0 + + # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] + O_torch = torch.matmul(attn_weights, v_repeat) # [b, q_heads, 1, head_size] + else: + # s_aux attention + logits = torch.matmul(q_expanded, k_repeat.transpose( + -2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] + + # Apply sequence length masking + for i in range(batch_size): + actual_k_len = k_seqlens[i] + logits[i, :, :, actual_k_len:] = float('-inf') + + sink_expanded = sink.view(1, q_heads, 1, 1) # [1, q_heads, 1, 1] + logits_max = torch.max(logits, dim=-1, keepdim=True).values + logits_or_sinks_max = torch.maximum(logits_max, sink_expanded) + sinks = torch.exp(sink_expanded - logits_or_sinks_max) + unnormalized_scores = torch.exp(logits - logits_or_sinks_max) + normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks + attn_weights = 
unnormalized_scores / normalizer + + # Mask out invalid positions + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_weights[i, :, :, actual_k_len:] = 0.0 + + # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] + O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), + v_repeat) # [b, q_heads, 1, head_size] + + O_torch = O_torch.squeeze(2) # [b, q_heads, head_size] + + # Compute attention score pooling for S + attn_score_pooled = torch.max_pool2d( + attn_weights.squeeze(2), # [b, q_heads, max_seqlen] + kernel_size=(q_heads, block_size), + stride=(q_heads, block_size), + ceil_mode=True).to(dtype=torch.float16) # [b, 1, ceil(max_seqlen/block_size)] + + print(f"O_triton shape: {O_triton.shape}") + print(f"O_tilelang shape: {O_tilelang.shape}") + print(f"O_torch shape: {O_torch.shape}") + print(f"S_triton shape: {S_triton.shape}") + print(f"S_tilelang shape: {S_tilelang.shape}") + print(f"attn_score_pooled shape: {attn_score_pooled.shape}") + + # Compare results + max_diff_o = torch.max(torch.abs(O_triton - O_torch)) + max_diff_o_tl = torch.max(torch.abs(O_tilelang - O_torch)) + print(f"Max difference in O: {max_diff_o.item()}") + print(f"Max difference in O_tilelang: {max_diff_o_tl.item()}") + + max_diff_s = torch.max(torch.abs(S_triton - attn_score_pooled)) + max_diff_s_tl = torch.max( + torch.abs(S_tilelang[:, :, :math.ceil(max_seqlen_k / block_size)] - attn_score_pooled)) + print(f"Max difference in S: {max_diff_s.item()}") + print(f"Max difference in S_tilelang: {max_diff_s_tl.item()}") + + assert torch.allclose( + O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" + assert torch.allclose( + S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" + assert torch.allclose( + O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tl.item()}" + assert torch.allclose( + S_tilelang[:, :, :math.ceil(max_seqlen_k / block_size)], + 
attn_score_pooled, + atol=1e-2, + rtol=1e-2), f"Score mismatch: {max_diff_s_tl.item()}" + + print("✅ All tests passed!") + + +def speed_benchmark_decode_comparison(args): + """Speed benchmark for decode kernel""" + batch_size = args.batch_size + q_heads = args.q_heads + kv_heads = args.kv_heads + max_k_seqlen = args.k_seqlen + real_max_k_seqlen = args.k_seqlen + head_size = args.head_size + block_size = args.block_size + page_block_size = args.page_block_size + dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16 + + print("\n=== Decode Speed Benchmark Comparison ===") + print("Configuration:") + print(f" Batch size: {batch_size}") + print(f" Q heads: {q_heads}, KV heads: {kv_heads}") + print(f" Max K sequence length: {max_k_seqlen}") + print(f" Head size: {head_size}") + print(f" Block size: {block_size}") + print(f" Data type: {dtype}") + print(f" Variable lengths: {args.test_varlen}") + print(f" s_aux attention: {args.test_sink}") + print() + + # Generate input data + if args.test_varlen: + k_seqlens = torch.randint(max_k_seqlen // 4, max_k_seqlen + 1, size=(batch_size,)) + else: + k_seqlens = torch.full((batch_size,), max_k_seqlen, dtype=int) + + # Generate cumulative sequence lengths for k + cu_seqlens_k = torch.zeros(batch_size + 1, device='cuda', dtype=torch.int32) + total_k_tokens = 0 + for i in range(batch_size): + cu_seqlens_k[i] = total_k_tokens + total_k_tokens += k_seqlens[i] + cu_seqlens_k[batch_size] = total_k_tokens + + # Generate tensors + q_decode = torch.randn(batch_size, q_heads, head_size, device='cuda', dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device='cuda', dtype=dtype) + + softmax_scale = 1.0 / math.sqrt(head_size) + max_seqlen_k = int(k_seqlens.max()) + + # Generate sink values if needed + sink = None + if args.test_sink: + sink = torch.randn(q_heads, device='cuda', dtype=torch.float32) * 0.1 # Small 
sink values + print(" Using sink attention with sink values") + + print("Setup complete:") + print(f" Total K tokens: {total_k_tokens}") + print(f" Actual max K seq len: {max_seqlen_k}") + if args.test_varlen: + print(f" K sequence lengths: {k_seqlens.tolist()}") + + # Warmup + num_tokens, q_h, head_size = q_decode.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = k_varlen.size(1) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, + args.test_sink, page_block_size) + + block_table = torch.zeros( + batch, math.ceil(real_max_k_seqlen / page_block_size), device='cuda', dtype=torch.int32) + block_cnt = 0 + for i in range(batch): + cur_seqlen = cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item() + for j in range(math.ceil(cur_seqlen / page_block_size)): + block_table[i, j] = block_cnt + block_cnt += 1 + block_cnt = 0 + + # Benchmark + print("⚡ Benchmarking Tilelang kernel (100 iterations)...") + tilelang_time = do_bench( + flash_attn_with_attn_pool_decode_tilelang, + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + args.k_seqlen, + 1, + softmax_scale, + sink, + block_size, + False, + tl_kernel, + block_table, + ) + print(f"Average decode kernel time Tilelang: {tilelang_time:.3f} ms") + + # Benchmark + print("⚡ Benchmarking Triton kernel (100 iterations)...") + triton_time = do_bench(flash_attn_with_attn_pool_decode, q_decode, k_varlen, v_varlen, + cu_seqlens_k, max_seqlen_k, args.k_seqlen, 1, softmax_scale, sink, + block_size) + print(f"Average decode kernel time Triton: {triton_time:.3f} ms") + print(f"Speedup: {(triton_time / tilelang_time):.3f}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Flash Attention Decode with Attention Pooling') + parser.add_argument('--batch_size', type=int, default=1, help='Batch size') + parser.add_argument('--q_heads', type=int, default=32, help='Number of query heads') + parser.add_argument('--kv_heads', type=int, default=8, help='Number of 
key-value heads') + parser.add_argument('--k_seqlen', type=int, default=8192, help='Key sequence length') + parser.add_argument( + '--head_size', type=int, default=128, choices=[64, 128, 256], help='Head dimension') + parser.add_argument('--block_size', type=int, default=128, help='Block size for computation') + parser.add_argument( + '--dtype', type=str, default='bfloat16', choices=['float16', 'bfloat16'], help='Data type') + parser.add_argument( + '--test_varlen', action='store_true', help='Test with truly variable sequence lengths') + parser.add_argument( + '--test_sink', action='store_true', help='Test with sink attention mechanism') + parser.add_argument('--benchmark', action='store_true', help='Run speed benchmark') + parser.add_argument( + '--num_split', type=int, default=1, choices=[1, 16], help='Number of splits') + parser.add_argument('--page_block_size', type=int, default=128, help='Page block size') + args = parser.parse_args() + args.test_sink = True + args.test_varlen = True + args.dtype = 'float16' + args.num_split = 1 + + if args.benchmark: + speed_benchmark_decode_comparison(args) + elif args.test_varlen: + test_varlen_decode_main(args) + else: + test_equal_seqlen_decode_main(args) -- GitLab From 041d4a06b53ebeb4540636063cad2aa66fc5e1b9 Mon Sep 17 00:00:00 2001 From: Kuris <227995639+kurisu6912@users.noreply.github.com> Date: Mon, 17 Nov 2025 13:06:23 +0800 Subject: [PATCH 007/139] [Refactor] add support for numpy dtype conversion (#1255) * add typing stub for tir.ir * remove idents * minor update * [Refactor] add numpy conversion for dtype * fix lint error * remove unused np.float_ in dtype conversion * fix type in np.int_ * fix typo * minor fix * remove debug files --- .../test_tilelang_language_frontend_v2.py | 113 ++++++------- tilelang/language/v2/dtypes.py | 155 +++++++++--------- 2 files changed, 134 insertions(+), 134 deletions(-) diff --git a/testing/python/language/test_tilelang_language_frontend_v2.py 
b/testing/python/language/test_tilelang_language_frontend_v2.py index fb3f1e15..1d9a20fe 100644 --- a/testing/python/language/test_tilelang_language_frontend_v2.py +++ b/testing/python/language/test_tilelang_language_frontend_v2.py @@ -145,62 +145,63 @@ def test_dtype_str_repr(): buf_24 = T.alloc_buffer((1,), dtype=T.float64, scope='shared') # noqa F841 -def test_torch_eq(): - dtypes = [ - T.bool, - T.short, - T.int, - T.long, - T.half, - T.float, - T.long, - T.int8, - T.int16, - T.int32, - T.int64, - T.uint8, - T.uint16, - T.uint32, - T.uint64, - T.float8_e4m3fn, - T.float8_e4m3fnuz, - T.float8_e5m2, - T.float8_e5m2fnuz, - T.float8_e8m0fnu, - T.float16, - T.bfloat16, - T.float32, - T.float64, - ] - torch_dtypes = [ - torch.bool, - torch.short, - torch.int, - torch.long, - torch.half, - torch.float, - torch.long, - torch.int8, - torch.int16, - torch.int32, - torch.int64, - torch.uint8, - torch.uint16, - torch.uint32, - torch.uint64, - torch.float8_e4m3fn, - torch.float8_e4m3fnuz, - torch.float8_e5m2, - torch.float8_e5m2fnuz, - torch.float8_e8m0fnu, - torch.float16, - torch.bfloat16, - torch.float32, - torch.float64, - ] - for a, b in zip(dtypes, torch_dtypes): - assert a == b, f"{a} and {b} are not equal" - assert T.dtype(b) == a, "dtype conversion error" +# not supported now +# def test_torch_eq(): +# dtypes = [ +# T.bool, +# T.short, +# T.int, +# T.long, +# T.half, +# T.float, +# T.long, +# T.int8, +# T.int16, +# T.int32, +# T.int64, +# T.uint8, +# T.uint16, +# T.uint32, +# T.uint64, +# T.float8_e4m3fn, +# T.float8_e4m3fnuz, +# T.float8_e5m2, +# T.float8_e5m2fnuz, +# T.float8_e8m0fnu, +# T.float16, +# T.bfloat16, +# T.float32, +# T.float64, +# ] +# torch_dtypes = [ +# torch.bool, +# torch.short, +# torch.int, +# torch.long, +# torch.half, +# torch.float, +# torch.long, +# torch.int8, +# torch.int16, +# torch.int32, +# torch.int64, +# torch.uint8, +# torch.uint16, +# torch.uint32, +# torch.uint64, +# torch.float8_e4m3fn, +# torch.float8_e4m3fnuz, +# 
torch.float8_e5m2, +# torch.float8_e5m2fnuz, +# torch.float8_e8m0fnu, +# torch.float16, +# torch.bfloat16, +# torch.float32, +# torch.float64, +# ] +# for a, b in zip(dtypes, torch_dtypes): +# assert a == b, f"{a} and {b} are not equal" +# assert T.dtype(b) == a, "dtype conversion error" def test_var_assign(): diff --git a/tilelang/language/v2/dtypes.py b/tilelang/language/v2/dtypes.py index 2161e377..0702635a 100644 --- a/tilelang/language/v2/dtypes.py +++ b/tilelang/language/v2/dtypes.py @@ -1,95 +1,98 @@ from tilelang import tvm from tvm import ir import torch -import ctypes from typing import TYPE_CHECKING, Union from tvm import tir import tvm.script.ir_builder.tir._ffi_api as tb_ffi +import numpy as np dtype = tvm.DataType # Python 3.9 compatibility: avoid PEP 604 unions at runtime AnyDType = Union[ir.Type, str, type, torch.dtype, dtype] -# Base dtype conversion list -_dtype_cvt_base = [ - (None, 'handle', ctypes.c_long, 'long', None), # use long to repr void* - (bool, 'bool', ctypes.c_bool, 'bool', 'Boolean'), - (int, 'int32', ctypes.c_int32, 'int', 'Int32'), - (float, 'float32', ctypes.c_float, 'float', 'Float32'), - (torch.short, 'int16', ctypes.c_int16, 'short', 'Int16'), - (torch.int, 'int32', ctypes.c_int32, 'int', 'Int32'), - (torch.long, 'int64', ctypes.c_int64, 'long long', 'Int64'), - (torch.half, 'float16', None, None, 'Float16'), - (torch.float, 'float32', ctypes.c_float, 'float', 'Float32'), - (torch.double, 'float64', ctypes.c_double, 'double', 'Float64'), - - # (pytype, 'tvm dtype str', 'ctypes dtype', 'cffi dtype') - (torch.bool, 'bool', ctypes.c_bool, 'bool', 'Boolean'), - (torch.int8, 'int8', ctypes.c_int8, 'char', 'Int8'), - (torch.int16, 'int16', ctypes.c_int16, 'short', 'Int16'), - (torch.int32, 'int32', ctypes.c_int32, 'int', 'Int32'), - (torch.int64, 'int64', ctypes.c_int64, 'long long', 'Int64'), - (torch.uint8, 'uint8', ctypes.c_uint8, 'unsigned char', 'UInt8'), - (torch.uint16, 'uint16', ctypes.c_uint16, 'unsigned short', 'UInt16'), - 
(torch.uint32, 'uint32', ctypes.c_uint32, 'unsigned int', 'UInt32'), - (torch.uint64, 'uint64', ctypes.c_uint64, 'unsigned long long', 'UInt64'), - (torch.float16, 'float16', None, None, 'Float16'), - (torch.float32, 'float32', ctypes.c_float, 'float', 'Float32'), - (torch.float64, 'float64', ctypes.c_double, 'double', 'Float64'), - (None, 'float8_e4m3', None, None, 'Float8E4M3'), - (torch.bfloat16, 'bfloat16', None, None, 'BFloat16'), -] - -# Dynamically add fp8-related types if they exist in torch -_fp8_dtype_mappings = [ - ('float8_e4m3fn', 'Float8E4M3FN'), - ('float8_e4m3fnuz', 'Float8E4M3FNUZ'), - ('float8_e5m2', 'Float8E5M2'), - ('float8_e5m2fnuz', 'Float8E5M2FNUZ'), - ('float8_e8m0fnu', 'Float8E8M0FNU'), -] - -_dtype_cvt = list(_dtype_cvt_base) -for torch_attr_name, tvm_name in _fp8_dtype_mappings: - if hasattr(torch, torch_attr_name): - torch_dtype = getattr(torch, torch_attr_name) - _dtype_cvt.append((torch_dtype, torch_attr_name, None, None, tvm_name)) - +_PYTHON_DTYPE_TO_STR = { + bool: 'bool', + int: 'int32', + float: 'float32', +} -def _create_type_mapper(sidx, didx, smapper=lambda x: x, dmapper=lambda x: x): - return { - smapper(item[sidx]): dmapper(item[didx]) - for item in _dtype_cvt - if item[didx] is not None and item[sidx] is not None - } +_NUMPY_DTYPE_TO_STR = { + np.bool_: 'bool', + np.short: 'int16', + np.int_: 'int64', + np.longlong: 'int64', + np.half: 'float16', + np.double: 'float64', + np.int8: 'int8', + np.int16: 'int16', + np.int32: 'int32', + np.int64: 'int64', + np.uint8: 'uint8', + np.uint16: 'uint16', + np.uint32: 'uint32', + np.uint64: 'uint64', + np.float16: 'float16', + np.float32: 'float32', + np.float64: 'float64', +} +_NUMPY_DTYPE_TO_STR.update({np.dtype(k): v for k, v in _NUMPY_DTYPE_TO_STR.items()}) -_dtype_py2tvmstr = _create_type_mapper(0, 1) -_dtype_tvmstr2fficall = _create_type_mapper(1, 4, dmapper=lambda x: getattr(tb_ffi, x)) -_dtype_tvm2py = _create_type_mapper(1, 0, lambda x: dtype(x)) -_dtype_tvm2ctype = 
_create_type_mapper(1, 2, lambda x: dtype(x)) -_dtype_tvm2cffi = _create_type_mapper(1, 3, lambda x: dtype(x)) +_TORCH_DTYPE_TO_STR = { + torch.bool: 'bool', + torch.short: 'int16', + torch.int: 'int32', + torch.long: 'int64', + torch.half: 'float16', + torch.float: 'float32', + torch.double: 'float64', + torch.int8: 'int8', + torch.int16: 'int16', + torch.int32: 'int32', + torch.int64: 'int64', + torch.uint8: 'uint8', + torch.uint16: 'uint16', + torch.uint32: 'uint32', + torch.uint64: 'uint64', + torch.float16: 'float16', + torch.float32: 'float32', + torch.float64: 'float64', + torch.bfloat16: 'bfloat16', +} +# _STR_TO_TORCH_DTYPE = {v: k for k, v in _TORCH_DTYPE_TO_STR.items()} -def __dtype_eq__(self: dtype, other: AnyDType): - if isinstance(other, str): - return str.__eq__(self, other) - if other in _dtype_py2tvmstr: - return str.__eq__(self, _dtype_py2tvmstr[other]) - return NotImplemented +# _STR_TO_NUMPY_DTYPE = {v: k for k, v in _NUMPY_DTYPE_TO_STR.items()} +_DTYPE_TO_STR = {**_PYTHON_DTYPE_TO_STR, **_NUMPY_DTYPE_TO_STR, **_TORCH_DTYPE_TO_STR} -def __dtype_ne__(self: dtype, other: AnyDType): - if isinstance(other, str): - return str.__ne__(self, other) - if other in _dtype_py2tvmstr: - return str.__ne__(self, _dtype_py2tvmstr[other]) - return NotImplemented +_STR_TO_TVM_DTYPE_CALL = { + 'bool': 'Boolean', + 'int8': 'Int8', + 'int32': 'Int32', + 'int64': 'Int64', + 'uint8': 'UInt8', + 'uint16': 'UInt16', + 'uint32': 'UInt32', + 'uint64': 'UInt64', + 'float16': 'Float16', + 'float32': 'Float32', + 'float64': 'Float64', + 'bfloat16': 'BFloat16', + 'float8_e4m3': 'Float8E4M3', + 'float8_e4m3fn': 'Float8E4M3FN', + 'float8_e4m3fnuz': 'Float8E4M3FNUZ', + 'float8_e5m2': 'Float8E5M2', + 'float8_e5m2fnuz': 'Float8E5M2FNUZ', + 'float8_e8m0fnu': 'Float8E8M0FNU' +} def __dtype_call__(self: dtype, expr=None, is_size_var: bool = False) -> tir.Var: - if self in _dtype_tvmstr2fficall: - return _dtype_tvmstr2fficall[self](expr, is_size_var) + if self in 
_STR_TO_TVM_DTYPE_CALL: + attr = _STR_TO_TVM_DTYPE_CALL[self] + call = getattr(tb_ffi, attr, None) + return call(expr, is_size_var) # try to construct the ffi call if self.startswith('uint'): val = 'UInt' + self[4:] @@ -117,17 +120,13 @@ __orig_dtype_new = dtype.__new__ def __dtype_new__(cls, value: AnyDType) -> dtype: if isinstance(value, str): return __orig_dtype_new(cls, value) - elif value in _dtype_py2tvmstr: - return __orig_dtype_new(cls, _dtype_py2tvmstr[value]) + elif value in _DTYPE_TO_STR: + return __orig_dtype_new(cls, _DTYPE_TO_STR[value]) else: - expected = set(list(_dtype_py2tvmstr.keys()) + list(_dtype_tvmstr2fficall.values())) + expected = set(list(_DTYPE_TO_STR.keys()) + list(_DTYPE_TO_STR.values())) raise TypeError(f"Invalid DataType {value}({type(value)}), expect one of {expected}") -dtype.__eq__ = __dtype_eq__ -dtype.__req__ = __dtype_eq__ -dtype.__ne__ = __dtype_ne__ -dtype.__rne__ = __dtype_ne__ dtype.__call__ = __dtype_call__ dtype.__new__ = __dtype_new__ -- GitLab From a2a278149f56bc6ffb8f99a10fde737d2d2ae677 Mon Sep 17 00:00:00 2001 From: Varuna Jayasiri Date: Mon, 17 Nov 2025 06:07:30 +0000 Subject: [PATCH 008/139] [EXAMPLE] In the flash attention example keep the max of all blocks seen in scores_max numerical stability (#1148) * Keep the max of all blocks seen in scores_max for stability * ruff formatting --- examples/flash_attention/example_mha_fwd_bhsd.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/flash_attention/example_mha_fwd_bhsd.py b/examples/flash_attention/example_mha_fwd_bhsd.py index e936cee3..e0e0bca2 100644 --- a/examples/flash_attention/example_mha_fwd_bhsd.py +++ b/examples/flash_attention/example_mha_fwd_bhsd.py @@ -86,6 +86,10 @@ def flashattn(batch, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # To do causal softmax, we 
need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. -- GitLab From b3d6f03cea2710497a8704c083148813ee0826f3 Mon Sep 17 00:00:00 2001 From: Chaofan Lin Date: Mon, 17 Nov 2025 19:42:32 +0800 Subject: [PATCH 009/139] [Docs] Improve Installation Guide (#1270) * [Docs] Improve installation guide * address comments --- docs/get_started/Installation.md | 134 ++++++++++--------------------- 1 file changed, 42 insertions(+), 92 deletions(-) diff --git a/docs/get_started/Installation.md b/docs/get_started/Installation.md index 3d5c6db9..be0d794e 100644 --- a/docs/get_started/Installation.md +++ b/docs/get_started/Installation.md @@ -8,25 +8,25 @@ - **Python Version**: >= 3.8 - **CUDA Version**: 12.0 <= CUDA < 13 -The easiest way to install **tile-lang** is directly from PyPI using pip. To install the latest version, run the following command in your terminal: +The easiest way to install tilelang is directly from PyPI using pip. 
To install the latest version, run the following command in your terminal: ```bash pip install tilelang ``` -Alternatively, you may choose to install **tile-lang** using prebuilt packages available on the Release Page: +Alternatively, you may choose to install tilelang using prebuilt packages available on the Release Page: ```bash pip install tilelang-0.0.0.dev0+ubuntu.20.4.cu120-py3-none-any.whl ``` -To install the latest version of **tile-lang** from the GitHub repository, you can run the following command: +To install the latest version of tilelang from the GitHub repository, you can run the following command: ```bash pip install git+https://github.com/tile-ai/tilelang.git ``` -After installing **tile-lang**, you can verify the installation by running: +After installing tilelang, you can verify the installation by running: ```bash python -c "import tilelang; print(tilelang.__version__)" @@ -40,18 +40,18 @@ python -c "import tilelang; print(tilelang.__version__)" - **Python Version**: >= 3.8 - **CUDA Version**: >= 10.0 -```bash -docker run -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.01-py3 -``` +If you prefer Docker, please skip to the [Install Using Docker](#install-using-docker) section. This section focuses on building from source on a native Linux environment. -To build and install **tile-lang** directly from source, follow these steps. This process requires certain pre-requisites from Apache TVM, which can be installed on Ubuntu/Debian-based systems using the following commands: +First, install the OS-level prerequisites on Ubuntu/Debian-based systems using the following commands: ```bash apt-get update apt-get install -y python3 python3-dev python3-setuptools gcc zlib1g-dev build-essential cmake libedit-dev ``` -After installing the prerequisites, you can clone the **tile-lang** repository and install it using pip: +Then, clone the tilelang repository and install it using pip. The `-v` flag enables verbose output during the build process. 
+ +> **Note**: Use the `--recursive` flag to include necessary submodules. Tilelang currently depends on a customized version of TVM, which is included as a submodule. If you prefer [Building with Existing TVM Installation](#using-existing-tvm), you can skip cloning the TVM submodule (but still need other dependencies). ```bash git clone --recursive https://github.com/tile-ai/tilelang.git @@ -59,12 +59,18 @@ cd tilelang pip install . -v ``` -If you want to install **tile-lang** in development mode, you can run the following command: +If you want to install tilelang in development mode, you can use the `-e` flag so that any changes to the Python files will be reflected immediately without reinstallation. ```bash pip install -e . -v ``` +> **Note**: changes to C++ files require rebuilding the tilelang C++ library. See [Faster Rebuild for Developers](#faster-rebuild-for-developers) below. A default `build` directory will be created if you use `pip install`, so you can also directly run `make` in the `build` directory to rebuild it as [Working from Source via PYTHONPATH](#working-from-source-via-pythonpath) suggested below. + +(working-from-source-via-pythonpath)= + +### Working from Source via `PYTHONPATH` + If you prefer to work directly from the source tree via `PYTHONPATH`, make sure the native extension is built first: ```bash @@ -85,17 +91,21 @@ Some useful CMake options you can toggle while configuring: - `-DUSE_ROCM=ON` selects ROCm support when building on AMD GPUs. - `-DNO_VERSION_LABEL=ON` disables the backend/git suffix in `tilelang.__version__`. -We currently provide four methods to install **tile-lang**: +(using-existing-tvm)= -1. [Install Using Docker](#install-method-1) (Recommended) -2. [Install from Source (using the bundled TVM submodule)](#install-method-2) -3. 
[Install from Source (using your own TVM installation)](#install-method-3) +### Building with Existing TVM Installation -(install-method-1)= +If you already have a compatible TVM installation, use the `TVM_ROOT` environment variable to specify the location of your existing TVM repository when building tilelang: -### Method 1: Install Using Docker (Recommended) +```bash +TVM_ROOT= pip install . -v +``` + +(install-using-docker)= -For users who prefer a containerized environment with all dependencies pre-configured, **tile-lang** provides Docker images for different CUDA versions. This method is particularly useful for ensuring consistent environments across different systems and is the **recommended approach** for most users. +## Install Using Docker + +For users who prefer a containerized environment with all dependencies pre-configured, tilelang provides Docker images for different CUDA versions. This method is particularly useful for ensuring consistent environments across different systems. **Prerequisites:** - Docker installed on your system @@ -142,82 +152,17 @@ docker run -itd \ - `--name tilelang_b200`: Assigns a name to the container for easy management - `/bin/zsh`: Uses zsh as the default shell -4. **Access the Container**: +4. **Access the Container and Verify Installation**: ```bash docker exec -it tilelang_b200 /bin/zsh -``` - -5. **Verify Installation**: - -Once inside the container, verify that **tile-lang** is working correctly: - -```bash +# Inside the container: python -c "import tilelang; print(tilelang.__version__)" ``` -You can now run TileLang examples and develop your applications within the containerized environment. The Docker image comes with all necessary dependencies pre-installed, including CUDA toolkit, TVM, and TileLang itself. 
- -**Example Usage:** - -After accessing the container, you can run TileLang examples: - -```bash -cd /home/tilelang/examples -python elementwise/test_example_elementwise.py -``` - -This Docker-based installation method provides a complete, isolated environment that works seamlessly on systems with compatible NVIDIA GPUs like the B200, ensuring optimal performance for TileLang applications. - -(install-method-2)= - -### Method 2: Install from Source (Using the Bundled TVM Submodule) - -If you already have a compatible TVM installation, follow these steps: - -1. **Clone the Repository**: - -```bash -git clone --recursive https://github.com/tile-ai/tilelang -cd tilelang -``` - -**Note**: Use the `--recursive` flag to include necessary submodules. - -2. **Configure Build Options**: - -Create a build directory and specify your existing TVM path: - -```bash -pip install . -v -``` - -(install-method-3)= - -### Method 3: Install from Source (Using Your Own TVM Installation) - -If you prefer to use the built-in TVM version, follow these instructions: - -1. **Clone the Repository**: - -```bash -git clone --recursive https://github.com/tile-ai/tilelang -cd tilelang -``` - -**Note**: Ensure the `--recursive` flag is included to fetch submodules. - -2. **Configure Build Options**: - -Copy the configuration file and enable the desired backends (e.g., LLVM and CUDA): - -```bash -TVM_ROOT= pip install . -v -``` - ## Install with Nightly Version -For users who want access to the latest features and improvements before official releases, we provide nightly builds of **tile-lang**. +For users who want access to the latest features and improvements before official releases, we provide nightly builds of tilelang. ```bash pip install tilelang -f https://tile-ai.github.io/whl/nightly/cu121/ @@ -253,23 +198,28 @@ Set `NO_TOOLCHAIN_VERSION=ON` to disable this. 
### Run-time environment variables +TODO + +## Other Tips -## IDE Configs +### IDE Configs -Building tilelang locally will automatically `compile_commands.json` file in `build` dir. +Building tilelang locally will automatically generate a `compile_commands.json` file in `build` dir. VSCode with clangd and [clangd extension](https://marketplace.visualstudio.com/items?itemName=llvm-vs-code-extensions.vscode-clangd) should be able to index that without extra configuration. -## Compile cache +### Compile Cache -`ccache` will be automatically used if found. +The default path of the compile cache is `~/.tilelang/cache`. `ccache` will be automatically used if found. -## Repairing wheels +### Repairing Wheels If you plan to use your wheel in other environment, -it's recommend to use auditwheel (on Linux) or delocate (on Darwin) +it's recommended to use auditwheel (on Linux) or delocate (on Darwin) to repair them. -## Faster rebuild for developers +(faster-rebuild-for-developers)= + +### Faster Rebuild for Developers `pip install` introduces extra [un]packaging and takes ~30 sec to complete, even if no source change. 
-- GitLab From 3ab93cd76b77978f416359bc9998e225ac276dcd Mon Sep 17 00:00:00 2001 From: Tong WU <109033598+Rachmanino@users.noreply.github.com> Date: Mon, 17 Nov 2025 21:53:19 +0800 Subject: [PATCH 010/139] [Enhancement] Keep max score attention across blocks in FlashAttention for better numerical stability (#1269) * Implement max score retention across blocks in FlashAttention for improved stability * fix manual pipeline parameters * Update examples/flash_attention/example_gqa_fwd_varlen.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * fix typo * more * fix a previous typo --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .../benchmark_tilelang_block_sparse_fmha.py | 2 ++ examples/amd/example_amd_flash_attn_bwd.py | 2 ++ examples/amd/example_amd_flash_attn_fwd.py | 2 ++ examples/attention_sink/example_gqa_sink_bwd_bhsd.py | 2 ++ .../example_gqa_sink_fwd_bhsd_wgmma_pipelined.py | 4 +++- examples/attention_sink/example_mha_sink_bwd_bhsd.py | 2 ++ examples/attention_sink/example_mha_sink_fwd_bhsd.py | 2 ++ .../example_mha_sink_fwd_bhsd_wgmma_pipelined.py | 4 +++- .../example_tilelang_sparse_gqa_decode_paged.py | 3 +-- ...example_tilelang_sparse_gqa_decode_varlen_indice.py | 3 +-- .../example_tilelang_sparse_gqa_decode_varlen_mask.py | 1 + .../amd/benchmark_mla_decode_amd_tilelang.py | 4 ++++ examples/deepseek_mla/example_mla_decode.py | 4 ++++ examples/deepseek_mla/example_mla_decode_paged.py | 4 ++++ examples/deepseek_mla/example_mla_decode_persistent.py | 2 ++ examples/deepseek_mla/example_mla_decode_ws.py | 10 +++++++++- .../experimental/example_mla_decode_kv_fp8.py | 2 ++ examples/deepseek_v32/sparse_mla_fwd.py | 2 ++ examples/deepseek_v32/sparse_mla_fwd_pipelined.py | 4 ++++ examples/flash_attention/README.md | 4 +++- examples/flash_attention/example_gqa_bwd.py | 2 ++ examples/flash_attention/example_gqa_bwd_tma_reduce.py | 2 ++ 
.../example_gqa_bwd_tma_reduce_varlen.py | 2 ++ .../flash_attention/example_gqa_bwd_wgmma_pipelined.py | 2 ++ examples/flash_attention/example_gqa_fwd_bshd.py | 2 ++ .../example_gqa_fwd_bshd_wgmma_pipelined.py | 4 +++- examples/flash_attention/example_gqa_fwd_varlen.py | 1 - examples/flash_attention/example_mha_bwd_bhsd.py | 2 ++ examples/flash_attention/example_mha_bwd_bshd.py | 4 +++- .../example_mha_bwd_bshd_wgmma_pipelined.py | 2 ++ .../example_mha_fwd_bhsd_wgmma_pipelined.py | 4 +++- examples/flash_attention/example_mha_fwd_bshd.py | 2 ++ .../example_mha_fwd_bshd_wgmma_pipelined.py | 4 +++- examples/flash_attention/example_mha_fwd_varlen.py | 2 ++ examples/flash_decoding/example_gqa_decode.py | 4 ++++ examples/flash_decoding/example_mha_inference.py | 2 ++ .../minference/example_vertical_slash_sparse_attn.py | 4 ++++ examples/seer_attention/block_sparse_attn_tilelang.py | 2 ++ .../test_tilelang_transform_config_index_bitwidth.py | 2 ++ 39 files changed, 99 insertions(+), 13 deletions(-) diff --git a/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py b/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py index aefe4d42..7c9edb59 100644 --- a/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py @@ -95,6 +95,8 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
diff --git a/examples/amd/example_amd_flash_attn_bwd.py b/examples/amd/example_amd_flash_attn_bwd.py index d47866e1..d5c52f9c 100644 --- a/examples/amd/example_amd_flash_attn_bwd.py +++ b/examples/amd/example_amd_flash_attn_bwd.py @@ -178,6 +178,8 @@ def fast_flashattn( T.copy(m_i, m_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for i in T.Parallel(block_M): + m_i[i] = T.max(m_i[i], m_prev[i]) for i in T.Parallel(block_M): if m_prev[i] == -T.infinity(accum_dtype): diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py index 6ec5db1e..3c422c28 100644 --- a/examples/amd/example_amd_flash_attn_fwd.py +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -171,6 +171,8 @@ def fast_flashattn( T.copy(m_i, m_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for i in T.Parallel(block_M): + m_i[i] = T.max(m_i[i], m_prev[i]) for i in T.Parallel(block_M): sf = T.exp(m_prev[i] * scale - m_i[i] * scale) diff --git a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py index eec43db9..b442505f 100644 --- a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py +++ b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py @@ -99,6 +99,8 @@ def flashattn_fwd( T.copy(V[bz, by // groups, k * block_N:(k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. 
diff --git a/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py b/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py index 7765603a..8d181726 100644 --- a/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py +++ b/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py @@ -105,6 +105,8 @@ def flashattn( T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. @@ -181,7 +183,7 @@ def flashattn( num_stages=num_stages, order=[-1, 0, 3, 1, -1, 2], stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) diff --git a/examples/attention_sink/example_mha_sink_bwd_bhsd.py b/examples/attention_sink/example_mha_sink_bwd_bhsd.py index 866668e4..b9fa0fd9 100644 --- a/examples/attention_sink/example_mha_sink_bwd_bhsd.py +++ b/examples/attention_sink/example_mha_sink_bwd_bhsd.py @@ -96,6 +96,8 @@ def flashattn_fwd( T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. 
diff --git a/examples/attention_sink/example_mha_sink_fwd_bhsd.py b/examples/attention_sink/example_mha_sink_fwd_bhsd.py index 2449b090..0ccb6958 100644 --- a/examples/attention_sink/example_mha_sink_fwd_bhsd.py +++ b/examples/attention_sink/example_mha_sink_fwd_bhsd.py @@ -95,6 +95,8 @@ def flashattn( T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. diff --git a/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py b/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py index 35284407..64d6ec69 100644 --- a/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py +++ b/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py @@ -98,6 +98,8 @@ def flashattn( T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. 
@@ -174,7 +176,7 @@ def flashattn( num_stages=num_stages, order=[-1, 0, 3, 1, -1, 2], stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py index e2998216..1c4b847d 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py @@ -105,8 +105,7 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - scores_max[i] = T.if_then_else(scores_max[i] > scores_max_prev[i], - scores_max[i], scores_max_prev[i]) + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py index ae300426..b3087522 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py @@ -95,8 +95,7 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - scores_max[i] = T.if_then_else(scores_max[i] > scores_max_prev[i], - scores_max[i], scores_max_prev[i]) + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) 
scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py index ad62817d..3417bd7f 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py @@ -92,6 +92,7 @@ def flashattn(batch, heads, heads_kv, dim, dim_v): T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py index db460437..61c3b63c 100644 --- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py +++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py @@ -91,6 +91,8 @@ def flashmla_decode(batch, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -157,6 +159,8 @@ def flashmla_decode(batch, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in 
T.Parallel(block_H, block_N): diff --git a/examples/deepseek_mla/example_mla_decode.py b/examples/deepseek_mla/example_mla_decode.py index 417e319f..3932d112 100644 --- a/examples/deepseek_mla/example_mla_decode.py +++ b/examples/deepseek_mla/example_mla_decode.py @@ -74,6 +74,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -148,6 +150,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): diff --git a/examples/deepseek_mla/example_mla_decode_paged.py b/examples/deepseek_mla/example_mla_decode_paged.py index fe50d4d4..d23ff00c 100644 --- a/examples/deepseek_mla/example_mla_decode_paged.py +++ b/examples/deepseek_mla/example_mla_decode_paged.py @@ -93,6 +93,8 @@ def mla_decode_tilelang(batch, acc_s[i, j] = T.if_then_else(k * block_N + j >= CACHE_SEQLENS[bx], -T.infinity(accum_dtype), acc_s[i, j]) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -176,6 +178,8 @@ def mla_decode_tilelang(batch, acc_s[i, j] = 
T.if_then_else(start + k * block_N + j >= CACHE_SEQLENS[bx], -T.infinity(accum_dtype), acc_s[i, j]) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): diff --git a/examples/deepseek_mla/example_mla_decode_persistent.py b/examples/deepseek_mla/example_mla_decode_persistent.py index 3f57ea05..2f896f26 100644 --- a/examples/deepseek_mla/example_mla_decode_persistent.py +++ b/examples/deepseek_mla/example_mla_decode_persistent.py @@ -98,6 +98,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) diff --git a/examples/deepseek_mla/example_mla_decode_ws.py b/examples/deepseek_mla/example_mla_decode_ws.py index 6554d57d..fcd427ef 100644 --- a/examples/deepseek_mla/example_mla_decode_ws.py +++ b/examples/deepseek_mla/example_mla_decode_ws.py @@ -104,7 +104,9 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.barrier_wait(bar_sScale_and_sS_free, ((i_i * 2) & 1) ^ 1) T.copy(m_i, m_i_prev) - T.reduce_max(acc_s, m_i, dim=1, clear=False) + T.reduce_max(acc_s, out=m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -137,6 +139,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, 
dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -324,6 +328,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -356,6 +362,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): diff --git a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py index 1b1447e8..b141822f 100644 --- a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py +++ b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py @@ -74,6 +74,8 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): diff --git a/examples/deepseek_v32/sparse_mla_fwd.py b/examples/deepseek_v32/sparse_mla_fwd.py index a39c72c4..e65b8901 100644 --- a/examples/deepseek_v32/sparse_mla_fwd.py +++ 
b/examples/deepseek_v32/sparse_mla_fwd.py @@ -147,6 +147,8 @@ def sparse_mla_fwd( ) T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): diff --git a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py index 96dda7df..1621d85b 100644 --- a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py +++ b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py @@ -164,6 +164,8 @@ def sparse_mla_fwd( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): @@ -198,6 +200,8 @@ def sparse_mla_fwd( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): diff --git a/examples/flash_attention/README.md b/examples/flash_attention/README.md index be11a8dc..633727ec 100644 --- a/examples/flash_attention/README.md +++ b/examples/flash_attention/README.md @@ -77,7 +77,9 @@ def flash_attention( # Compute the maximum value per row on dimension 1 (block_N) T.reduce_max(acc_s, scores_max, dim=1, clear=False) - + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # Compute the factor by which we need to rescale previous partial sums for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] - scores_max[i]) diff --git a/examples/flash_attention/example_gqa_bwd.py 
b/examples/flash_attention/example_gqa_bwd.py index dd9c8f7c..968d1de3 100644 --- a/examples/flash_attention/example_gqa_bwd.py +++ b/examples/flash_attention/example_gqa_bwd.py @@ -61,6 +61,8 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce.py b/examples/flash_attention/example_gqa_bwd_tma_reduce.py index 2af06e4b..c427908a 100644 --- a/examples/flash_attention/example_gqa_bwd_tma_reduce.py +++ b/examples/flash_attention/example_gqa_bwd_tma_reduce.py @@ -66,6 +66,8 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py index 88f2d81e..a9604f4d 100644 --- a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py +++ b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py @@ -119,6 +119,8 @@ def flashattn_fwd(batch, V_shared[i, d] = 0.0 T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], 
scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): diff --git a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py index 02421249..e916812f 100644 --- a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py +++ b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py @@ -61,6 +61,8 @@ def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, bloc T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): diff --git a/examples/flash_attention/example_gqa_fwd_bshd.py b/examples/flash_attention/example_gqa_fwd_bshd.py index 3d4bfe45..a6d3b5f2 100644 --- a/examples/flash_attention/example_gqa_fwd_bshd.py +++ b/examples/flash_attention/example_gqa_fwd_bshd.py @@ -127,6 +127,8 @@ def flashattn(batch, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
diff --git a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py index 21f5e9a9..03ad15e9 100644 --- a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py +++ b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py @@ -94,6 +94,8 @@ def flashattn( T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. @@ -154,7 +156,7 @@ def flashattn( num_stages=num_stages, order=[-1, 0, 3, 1, -1, 2], stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) diff --git a/examples/flash_attention/example_gqa_fwd_varlen.py b/examples/flash_attention/example_gqa_fwd_varlen.py index db16e158..ccc50e41 100644 --- a/examples/flash_attention/example_gqa_fwd_varlen.py +++ b/examples/flash_attention/example_gqa_fwd_varlen.py @@ -155,7 +155,6 @@ def flashattn(batch_size, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) - for i in T.Parallel(block_M): scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) diff --git a/examples/flash_attention/example_mha_bwd_bhsd.py b/examples/flash_attention/example_mha_bwd_bhsd.py index 8247b265..d91d1770 100644 --- a/examples/flash_attention/example_mha_bwd_bhsd.py +++ b/examples/flash_attention/example_mha_bwd_bhsd.py @@ -63,6 +63,8 @@ def 
flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): diff --git a/examples/flash_attention/example_mha_bwd_bshd.py b/examples/flash_attention/example_mha_bwd_bshd.py index 414061ff..7c85f982 100644 --- a/examples/flash_attention/example_mha_bwd_bshd.py +++ b/examples/flash_attention/example_mha_bwd_bshd.py @@ -59,6 +59,8 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -344,7 +346,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--batch', type=int, default=8, help='Batch size') parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1048, help='Context size') + parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') parser.add_argument('--d_head', type=int, default=64, help='Head dimension') parser.add_argument('--causal', type=bool, default=False, help='Causal flag') args = parser.parse_args() diff --git a/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py index e10ef581..e8ee5d97 100644 --- 
a/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py @@ -60,6 +60,8 @@ def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): diff --git a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py index e1d0130a..b797bbcc 100644 --- a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py @@ -86,6 +86,8 @@ def flashattn(batch, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
@@ -149,7 +151,7 @@ def flashattn(batch, num_stages=num_stages, order=[-1, 0, 3, 1, -1, 2], stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) diff --git a/examples/flash_attention/example_mha_fwd_bshd.py b/examples/flash_attention/example_mha_fwd_bshd.py index a9268019..b5b72828 100644 --- a/examples/flash_attention/example_mha_fwd_bshd.py +++ b/examples/flash_attention/example_mha_fwd_bshd.py @@ -81,6 +81,8 @@ def flashattn(batch, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. diff --git a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py index d7023a20..02d8baef 100644 --- a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py @@ -81,6 +81,8 @@ def flashattn(batch, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
@@ -141,7 +143,7 @@ def flashattn(batch, num_stages=num_stages, order=[-1, 0, 3, 1, -1, 2], stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]]): MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) diff --git a/examples/flash_attention/example_mha_fwd_varlen.py b/examples/flash_attention/example_mha_fwd_varlen.py index f381e900..bbb4546c 100644 --- a/examples/flash_attention/example_mha_fwd_varlen.py +++ b/examples/flash_attention/example_mha_fwd_varlen.py @@ -167,6 +167,8 @@ def flashattn(batch_size, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
diff --git a/examples/flash_decoding/example_gqa_decode.py b/examples/flash_decoding/example_gqa_decode.py index 9ec3a026..46d9beea 100644 --- a/examples/flash_decoding/example_gqa_decode.py +++ b/examples/flash_decoding/example_gqa_decode.py @@ -115,6 +115,8 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -188,6 +190,8 @@ def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): diff --git a/examples/flash_decoding/example_mha_inference.py b/examples/flash_decoding/example_mha_inference.py index 3eabc9a7..0360b3e2 100644 --- a/examples/flash_decoding/example_mha_inference.py +++ b/examples/flash_decoding/example_mha_inference.py @@ -70,6 +70,8 @@ def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_ T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
diff --git a/examples/minference/example_vertical_slash_sparse_attn.py b/examples/minference/example_vertical_slash_sparse_attn.py index ebf8513a..48df3e09 100644 --- a/examples/minference/example_vertical_slash_sparse_attn.py +++ b/examples/minference/example_vertical_slash_sparse_attn.py @@ -87,6 +87,8 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) @@ -194,6 +196,8 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - diff --git a/examples/seer_attention/block_sparse_attn_tilelang.py b/examples/seer_attention/block_sparse_attn_tilelang.py index dcd581c6..219d3ee3 100644 --- a/examples/seer_attention/block_sparse_attn_tilelang.py +++ b/examples/seer_attention/block_sparse_attn_tilelang.py @@ -62,6 +62,8 @@ def blocksparse_flashattn(batch, heads, seq_q, seq_kv, dim, downsample_len, is_c T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
diff --git a/testing/python/transform/test_tilelang_transform_config_index_bitwidth.py b/testing/python/transform/test_tilelang_transform_config_index_bitwidth.py index f051f028..1ef1589a 100644 --- a/testing/python/transform/test_tilelang_transform_config_index_bitwidth.py +++ b/testing/python/transform/test_tilelang_transform_config_index_bitwidth.py @@ -71,6 +71,8 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. -- GitLab From 220c32362ef5e152621082f310fb89202b92323c Mon Sep 17 00:00:00 2001 From: Yu Cheng <54519279+chengyupku@users.noreply.github.com> Date: Tue, 18 Nov 2025 01:26:51 +0800 Subject: [PATCH 011/139] [Bugfix] Fix multiple cg defination when using T.sync_grid (#1272) --- src/target/codegen_cuda.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/target/codegen_cuda.cc b/src/target/codegen_cuda.cc index 6b5f5063..dda96925 100644 --- a/src/target/codegen_cuda.cc +++ b/src/target/codegen_cuda.cc @@ -1645,10 +1645,7 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) { } else if (op->op.same_as(tl::sync_grid())) { this->need_cooperative_groups_ = true; this->PrintIndent(); - this->stream << "cooperative_groups::grid_group grid = " - "cooperative_groups::this_grid();\n"; - this->PrintIndent(); - this->stream << "grid.sync();\n"; + this->stream << "cooperative_groups::this_grid().sync();\n"; } else if (op->op.same_as(tl::loop_break())) { this->PrintIndent(); this->stream << "break;\n"; -- GitLab From b1922518ce3238a3982c61e909e8fc74ab4e37cc Mon 
Sep 17 00:00:00 2001 From: Yichen Yan Date: Tue, 18 Nov 2025 11:36:32 +0800 Subject: [PATCH 012/139] [Minor] Remove from __future__ import annotations for python 3.8 (#1273) --- tilelang/carver/arch/arch_base.py | 3 --- tilelang/carver/common_schedules.py | 1 - tilelang/carver/roller/hint.py | 3 +-- tilelang/carver/roller/policy/common.py | 1 - tilelang/carver/roller/rasterization.py | 1 - tilelang/carver/roller/shape_inference/common.py | 1 - tilelang/carver/roller/shape_inference/tir.py | 1 - tilelang/carver/template/base.py | 7 +++---- tilelang/carver/template/conv.py | 1 - tilelang/carver/template/elementwise.py | 1 - tilelang/carver/template/flashattention.py | 1 - tilelang/carver/template/gemv.py | 1 - tilelang/carver/template/matmul.py | 1 - tilelang/contrib/cc.py | 1 - tilelang/contrib/nvcc.py | 1 - tilelang/intrinsics/mma_sm70_layout.py | 3 --- tilelang/jit/adapter/ctypes/adapter.py | 1 - tilelang/jit/adapter/cython/adapter.py | 1 - tilelang/jit/adapter/dlpack.py | 2 -- tilelang/language/allocate.py | 2 +- tilelang/language/annotations.py | 2 -- tilelang/language/copy.py | 1 - tilelang/language/customize.py | 1 - tilelang/language/experimental/gemm_sp.py | 1 - tilelang/language/fill.py | 1 - tilelang/language/frame.py | 1 - tilelang/language/gemm.py | 1 - tilelang/language/kernel.py | 1 - tilelang/language/loop.py | 1 - tilelang/language/overrides/parser.py | 2 -- tilelang/language/parser/operation.py | 2 -- tilelang/language/proxy.py | 2 +- tilelang/language/reduce.py | 1 - tilelang/language/tir/ir.py | 1 - tilelang/language/utils.py | 1 - tilelang/language/v2/builder.py | 1 - tilelang/language/warpgroup.py | 2 -- tilelang/layout/fragment.py | 10 ++++------ tilelang/layout/gemm_sp.py | 1 - tilelang/layout/layout.py | 6 ++---- tilelang/layout/swizzle.py | 2 +- tilelang/primitives/gemm/__init__.py | 1 - tilelang/profiler/__init__.py | 1 - tilelang/quantize/lop3.py | 1 - tilelang/quantize/mxfp.py | 1 - tilelang/transform/add_bufstore_wrapper.py | 1 - 
tilelang/utils/tensor.py | 1 - 47 files changed, 13 insertions(+), 68 deletions(-) diff --git a/tilelang/carver/arch/arch_base.py b/tilelang/carver/arch/arch_base.py index a10fa434..4c8825e8 100644 --- a/tilelang/carver/arch/arch_base.py +++ b/tilelang/carver/arch/arch_base.py @@ -1,6 +1,3 @@ -from __future__ import annotations - - class TileDevice: """ Represents the architecture of a computing device, capturing various hardware specifications. diff --git a/tilelang/carver/common_schedules.py b/tilelang/carver/common_schedules.py index 2766a15e..199f0158 100644 --- a/tilelang/carver/common_schedules.py +++ b/tilelang/carver/common_schedules.py @@ -19,7 +19,6 @@ # Modifications Copyright (c) Microsoft. # The code below is mostly copied from apache/tvm common_schedules.py in dlight. """Common schedule strategies for TIR.""" -from __future__ import annotations from typing import Callable from tvm import tir diff --git a/tilelang/carver/roller/hint.py b/tilelang/carver/roller/hint.py index 20d62f68..17c69dae 100644 --- a/tilelang/carver/roller/hint.py +++ b/tilelang/carver/roller/hint.py @@ -1,5 +1,4 @@ """Hint definition for schedule""" -from __future__ import annotations from tvm import DataType from . 
import PrimFuncNode import numpy as np @@ -218,7 +217,7 @@ class Hint: return dic @classmethod - def from_dict(cls, dic: dict) -> Hint: + def from_dict(cls, dic: dict) -> 'Hint': hint = cls() for k, v in dic.items(): setattr(hint, k, v) diff --git a/tilelang/carver/roller/policy/common.py b/tilelang/carver/roller/policy/common.py index 747dddbb..fb33eefd 100644 --- a/tilelang/carver/roller/policy/common.py +++ b/tilelang/carver/roller/policy/common.py @@ -1,4 +1,3 @@ -from __future__ import annotations import numpy as np diff --git a/tilelang/carver/roller/rasterization.py b/tilelang/carver/roller/rasterization.py index 39c603b6..ebd1319a 100644 --- a/tilelang/carver/roller/rasterization.py +++ b/tilelang/carver/roller/rasterization.py @@ -1,5 +1,4 @@ """Rasteration Plan For L2 Cache Locality""" -from __future__ import annotations class Rasterization: diff --git a/tilelang/carver/roller/shape_inference/common.py b/tilelang/carver/roller/shape_inference/common.py index aaf59aed..c52a170e 100644 --- a/tilelang/carver/roller/shape_inference/common.py +++ b/tilelang/carver/roller/shape_inference/common.py @@ -1,4 +1,3 @@ -from __future__ import annotations from collections import OrderedDict from tvm import arith diff --git a/tilelang/carver/roller/shape_inference/tir.py b/tilelang/carver/roller/shape_inference/tir.py index 675298c6..618cf9b3 100644 --- a/tilelang/carver/roller/shape_inference/tir.py +++ b/tilelang/carver/roller/shape_inference/tir.py @@ -1,4 +1,3 @@ -from __future__ import annotations from collections.abc import Mapping from tvm.tir.schedule.schedule import BlockRV from tvm.ir import structural_equal diff --git a/tilelang/carver/template/base.py b/tilelang/carver/template/base.py index 5aa5074c..a119c16a 100644 --- a/tilelang/carver/template/base.py +++ b/tilelang/carver/template/base.py @@ -1,5 +1,4 @@ # Import necessary modules and classes -from __future__ import annotations from abc import ABC, abstractmethod # For defining abstract base classes 
from dataclasses import dataclass, field # For defining data classes from ..arch import ( # Import architecture-related utilities and classes @@ -42,7 +41,7 @@ class BaseTemplate(ABC): """ pass - def with_arch(self, arch: TileDevice) -> BaseTemplate: + def with_arch(self, arch: TileDevice) -> 'BaseTemplate': """ Sets the architecture for this template and returns itself. @@ -110,7 +109,7 @@ class BaseTemplate(ABC): """ raise NotImplementedError("initialize_function is not implemented") - def set_function(self, func: PrimFunc) -> BaseTemplate: + def set_function(self, func: PrimFunc) -> 'BaseTemplate': """ Sets the function for this template and returns itself. @@ -123,7 +122,7 @@ class BaseTemplate(ABC): self._func = func return self - def set_output_nodes(self, output_nodes: list[OutputNode]) -> BaseTemplate: + def set_output_nodes(self, output_nodes: list[OutputNode]) -> 'BaseTemplate': """ Sets the output nodes for this template and returns itself. diff --git a/tilelang/carver/template/conv.py b/tilelang/carver/template/conv.py index f180084d..9ea89202 100644 --- a/tilelang/carver/template/conv.py +++ b/tilelang/carver/template/conv.py @@ -1,4 +1,3 @@ -from __future__ import annotations from dataclasses import dataclass from .base import BaseTemplate from tvm import te, tir diff --git a/tilelang/carver/template/elementwise.py b/tilelang/carver/template/elementwise.py index 26d53152..8cd30619 100644 --- a/tilelang/carver/template/elementwise.py +++ b/tilelang/carver/template/elementwise.py @@ -1,5 +1,4 @@ # Import necessary modules -from __future__ import annotations from dataclasses import dataclass # Used for defining data classes from .base import BaseTemplate # Importing the base class for templates from tvm import te # Importing TVM's tensor expression module diff --git a/tilelang/carver/template/flashattention.py b/tilelang/carver/template/flashattention.py index 760b1981..ae1a2540 100644 --- a/tilelang/carver/template/flashattention.py +++ 
b/tilelang/carver/template/flashattention.py @@ -1,4 +1,3 @@ -from __future__ import annotations from dataclasses import dataclass from .base import BaseTemplate from tvm import te diff --git a/tilelang/carver/template/gemv.py b/tilelang/carver/template/gemv.py index 7195a0b8..cdcc78d0 100644 --- a/tilelang/carver/template/gemv.py +++ b/tilelang/carver/template/gemv.py @@ -1,4 +1,3 @@ -from __future__ import annotations from dataclasses import dataclass from .base import BaseTemplate from tvm import te diff --git a/tilelang/carver/template/matmul.py b/tilelang/carver/template/matmul.py index 4847cdb2..653ddab3 100644 --- a/tilelang/carver/template/matmul.py +++ b/tilelang/carver/template/matmul.py @@ -1,4 +1,3 @@ -from __future__ import annotations from dataclasses import dataclass from .base import BaseTemplate from tvm import te diff --git a/tilelang/contrib/cc.py b/tilelang/contrib/cc.py index 0807c255..87d943ab 100644 --- a/tilelang/contrib/cc.py +++ b/tilelang/contrib/cc.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. 
"""Util to invoke C/C++ compilers in the system.""" -from __future__ import annotations import functools import os import shutil diff --git a/tilelang/contrib/nvcc.py b/tilelang/contrib/nvcc.py index 2903b15d..202e0f3b 100644 --- a/tilelang/contrib/nvcc.py +++ b/tilelang/contrib/nvcc.py @@ -1,7 +1,6 @@ # pylint: disable=invalid-name # modified from apache tvm python/tvm/contrib/nvcc.py """Utility to invoke nvcc compiler in the system""" -from __future__ import absolute_import as _abs from __future__ import annotations import os diff --git a/tilelang/intrinsics/mma_sm70_layout.py b/tilelang/intrinsics/mma_sm70_layout.py index d6491c2b..e7a57da7 100644 --- a/tilelang/intrinsics/mma_sm70_layout.py +++ b/tilelang/intrinsics/mma_sm70_layout.py @@ -1,6 +1,3 @@ -from __future__ import annotations - - def shared_16x4_to_mma_a_32x4_layout(row, col, rep): tid = (row % 4) + 16 * ((row // 4) % 2) + 4 * (row // 8) + 8 * rep local_id = col diff --git a/tilelang/jit/adapter/ctypes/adapter.py b/tilelang/jit/adapter/ctypes/adapter.py index 648c66c1..bf0aef51 100644 --- a/tilelang/jit/adapter/ctypes/adapter.py +++ b/tilelang/jit/adapter/ctypes/adapter.py @@ -1,6 +1,5 @@ """The profiler and convert to torch utils""" from __future__ import annotations - import torch from ..base import BaseKernelAdapter import ctypes diff --git a/tilelang/jit/adapter/cython/adapter.py b/tilelang/jit/adapter/cython/adapter.py index 7857872c..bc43533b 100644 --- a/tilelang/jit/adapter/cython/adapter.py +++ b/tilelang/jit/adapter/cython/adapter.py @@ -1,6 +1,5 @@ """The profiler and convert to torch utils""" from __future__ import annotations - import ctypes import logging import torch diff --git a/tilelang/jit/adapter/dlpack.py b/tilelang/jit/adapter/dlpack.py index 9fa767f0..402dfb2f 100644 --- a/tilelang/jit/adapter/dlpack.py +++ b/tilelang/jit/adapter/dlpack.py @@ -1,6 +1,4 @@ """The profiler and convert to torch utils""" -from __future__ import annotations - import torch from tilelang.contrib.dlpack 
import to_pytorch_func from .base import BaseKernelAdapter diff --git a/tilelang/language/allocate.py b/tilelang/language/allocate.py index d70355ad..f0784e86 100644 --- a/tilelang/language/allocate.py +++ b/tilelang/language/allocate.py @@ -13,8 +13,8 @@ Available allocation functions: Each function takes shape and dtype parameters and returns a TVM buffer object with the appropriate memory scope. """ - from __future__ import annotations + from typing import overload, Literal from tilelang import tvm as tvm from tvm.script import tir as T diff --git a/tilelang/language/annotations.py b/tilelang/language/annotations.py index 3c469e78..2ce71cb9 100644 --- a/tilelang/language/annotations.py +++ b/tilelang/language/annotations.py @@ -1,6 +1,4 @@ """Annotation helpers exposed on the TileLang language surface.""" -from __future__ import annotations - from typing import Callable from tilelang.layout import Layout diff --git a/tilelang/language/copy.py b/tilelang/language/copy.py index 4ad857b5..62de13d0 100644 --- a/tilelang/language/copy.py +++ b/tilelang/language/copy.py @@ -1,6 +1,5 @@ """The language interface for tl programs.""" from __future__ import annotations - from typing import Literal from tilelang import language as T from tilelang.utils.language import ( diff --git a/tilelang/language/customize.py b/tilelang/language/customize.py index 0830c22d..9175bdb8 100644 --- a/tilelang/language/customize.py +++ b/tilelang/language/customize.py @@ -1,6 +1,5 @@ """The language interface for tl programs.""" from __future__ import annotations - import tilelang.language as T from tvm.tir import PrimExpr, Buffer, op from .atomic import atomic_max, atomic_min, atomic_add, atomic_addx2, atomic_addx4, atomic_load, atomic_store # noqa: F401 diff --git a/tilelang/language/experimental/gemm_sp.py b/tilelang/language/experimental/gemm_sp.py index fc511c00..e966e7d6 100644 --- a/tilelang/language/experimental/gemm_sp.py +++ b/tilelang/language/experimental/gemm_sp.py @@ -1,6 +1,5 
@@ """The language interface for tl programs.""" from __future__ import annotations - from tilelang.primitives.gemm.base import GemmWarpPolicy import tilelang.language as T from tvm import tir diff --git a/tilelang/language/fill.py b/tilelang/language/fill.py index 74aeb264..ad74720f 100644 --- a/tilelang/language/fill.py +++ b/tilelang/language/fill.py @@ -1,6 +1,5 @@ """The language interface for tl programs.""" from __future__ import annotations - from tvm import tir from tilelang.language import has_let_value, get_let_value from tilelang.utils.language import get_buffer_region_from_load diff --git a/tilelang/language/frame.py b/tilelang/language/frame.py index 8e6d5926..db649952 100644 --- a/tilelang/language/frame.py +++ b/tilelang/language/frame.py @@ -1,6 +1,5 @@ """Override the LetFrame to print a message when entering the frame.""" from __future__ import annotations - from tvm.ffi import register_object as _register_object from tvm.tir import Var, PrimExpr, BufferLoad, BufferRegion from tvm.ir import Range diff --git a/tilelang/language/gemm.py b/tilelang/language/gemm.py index 0f01582f..0f2e82d7 100644 --- a/tilelang/language/gemm.py +++ b/tilelang/language/gemm.py @@ -1,6 +1,5 @@ """The language interface for tl programs.""" from __future__ import annotations - from tilelang.primitives.gemm.base import GemmWarpPolicy import tilelang.language as T from tvm import tir diff --git a/tilelang/language/kernel.py b/tilelang/language/kernel.py index 54b78d3d..5e819da7 100644 --- a/tilelang/language/kernel.py +++ b/tilelang/language/kernel.py @@ -1,6 +1,5 @@ """The language interface for tl programs.""" from __future__ import annotations - from collections import deque from tvm import tir from tvm.tir import Var diff --git a/tilelang/language/loop.py b/tilelang/language/loop.py index 85f2acd8..4f8d5c30 100644 --- a/tilelang/language/loop.py +++ b/tilelang/language/loop.py @@ -1,6 +1,5 @@ """The language interface for tl programs.""" from __future__ import 
annotations - from typing import Any from tvm import tir from tvm.tir import IntImm diff --git a/tilelang/language/overrides/parser.py b/tilelang/language/overrides/parser.py index 01d59b60..af42098a 100644 --- a/tilelang/language/overrides/parser.py +++ b/tilelang/language/overrides/parser.py @@ -1,6 +1,4 @@ """TVMScript parser overrides tailored for TileLang.""" -from __future__ import annotations - from functools import partial from tvm.script.ir_builder import tir as T diff --git a/tilelang/language/parser/operation.py b/tilelang/language/parser/operation.py index 43774947..b2138acf 100644 --- a/tilelang/language/parser/operation.py +++ b/tilelang/language/parser/operation.py @@ -17,8 +17,6 @@ # This file is modified from the original version, # which is part of the TVM project (https://tvm.apache.org/). """The tir expression operation registration""" -from __future__ import annotations - from tvm import tir from tvm.ffi.runtime_ctypes import DataType, DataTypeCode from tvm.tir import IntImm diff --git a/tilelang/language/proxy.py b/tilelang/language/proxy.py index 2c5a372f..e2f65e83 100644 --- a/tilelang/language/proxy.py +++ b/tilelang/language/proxy.py @@ -1,6 +1,6 @@ """The language interface for tl programs.""" - from __future__ import annotations + from typing import Any, SupportsIndex, TYPE_CHECKING from collections.abc import Sequence from typing_extensions import Self diff --git a/tilelang/language/reduce.py b/tilelang/language/reduce.py index 5b895c41..09289559 100644 --- a/tilelang/language/reduce.py +++ b/tilelang/language/reduce.py @@ -1,6 +1,5 @@ """The language interface for tl programs.""" from __future__ import annotations - from tvm import tir from tilelang.language import copy, macro, alloc_shared, alloc_fragment from tilelang.language.utils import buffer_to_tile_region diff --git a/tilelang/language/tir/ir.py b/tilelang/language/tir/ir.py index fc5491ce..74cb32f7 100644 --- a/tilelang/language/tir/ir.py +++ b/tilelang/language/tir/ir.py @@ 
-1,4 +1,3 @@ -from __future__ import annotations import tvm.script.ir_builder.tir.ir as _ir from tvm.script.ir_builder.tir import frame from tvm.tir import PrimExpr diff --git a/tilelang/language/utils.py b/tilelang/language/utils.py index 8a918c3f..ad8b83dd 100644 --- a/tilelang/language/utils.py +++ b/tilelang/language/utils.py @@ -1,4 +1,3 @@ -from __future__ import annotations from tilelang import tvm as tvm from tvm import tir from tvm.tir import PrimExpr, Buffer, BufferLoad, op diff --git a/tilelang/language/v2/builder.py b/tilelang/language/v2/builder.py index 90c8a8e9..684880b7 100644 --- a/tilelang/language/v2/builder.py +++ b/tilelang/language/v2/builder.py @@ -1,5 +1,4 @@ from __future__ import annotations - from contextlib import contextmanager, AbstractContextManager from dataclasses import dataclass import inspect diff --git a/tilelang/language/warpgroup.py b/tilelang/language/warpgroup.py index 872d3001..bec76809 100644 --- a/tilelang/language/warpgroup.py +++ b/tilelang/language/warpgroup.py @@ -1,6 +1,4 @@ """The language interface for tl programs.""" -from __future__ import annotations - from tvm.script.ir_builder.tir.frame import TIRFrame from tvm.ffi import register_object from tilelang import _ffi_api diff --git a/tilelang/layout/fragment.py b/tilelang/layout/fragment.py index 06fc7a98..b9a56d8e 100644 --- a/tilelang/layout/fragment.py +++ b/tilelang/layout/fragment.py @@ -1,7 +1,5 @@ """Wrapping Layouts.""" # pylint: disable=invalid-name, unsupported-binary-operation -from __future__ import annotations - import tvm import tvm_ffi from tvm.ir import Range @@ -124,7 +122,7 @@ class Fragment(Layout): def repeat(self, repeats, repeat_on_thread: bool = False, - lower_dim_first: bool = True) -> Fragment: + lower_dim_first: bool = True) -> 'Fragment': """ Returns a new Fragment that repeats the iteration space a given number of times. 
@@ -144,7 +142,7 @@ class Fragment(Layout): """ return _ffi_api.Fragment_repeat(self, repeats, repeat_on_thread, lower_dim_first) - def replicate(self, replicate: int) -> Fragment: + def replicate(self, replicate: int) -> 'Fragment': """ Replicate the Fragment across a new thread dimension. @@ -160,7 +158,7 @@ class Fragment(Layout): """ return _ffi_api.Fragment_replicate(self, replicate) - def condense_rep_var(self) -> Fragment: + def condense_rep_var(self) -> 'Fragment': """ Condense or fold the replicate variable into the existing iteration space. This operation may be used to reduce dimensionality if the replicate variable @@ -207,7 +205,7 @@ class Fragment(Layout): """ return f"Fragment<{self.get_input_shape()}->{self.get_output_shape()}, thread={self.thread}, index={self.index}>" - def is_equal(self, other: Fragment) -> bool: + def is_equal(self, other: 'Fragment') -> bool: """ Check if the current fragment is equal to another fragment. """ diff --git a/tilelang/layout/gemm_sp.py b/tilelang/layout/gemm_sp.py index 2fd58cd2..eaaa178f 100644 --- a/tilelang/layout/gemm_sp.py +++ b/tilelang/layout/gemm_sp.py @@ -1,7 +1,6 @@ """Wrapping Layouts.""" # pylint: disable=invalid-name, unsupported-binary-operation from __future__ import annotations - import tvm import tilelang.language as T import warnings diff --git a/tilelang/layout/layout.py b/tilelang/layout/layout.py index 14db1222..10e0357e 100644 --- a/tilelang/layout/layout.py +++ b/tilelang/layout/layout.py @@ -1,7 +1,5 @@ """Wrapping Layouts.""" # pylint: disable=invalid-name, unsupported-binary-operation -from __future__ import annotations - import tvm_ffi from tvm.ir import Node, Range from tvm.tir import IterVar, Var, PrimExpr, IndexMap @@ -122,7 +120,7 @@ class Layout(Node): # Map the provided indices using the constructed index mapping return index_map.map_indices(indices) - def inverse(self) -> Layout: + def inverse(self) -> 'Layout': """ Compute the inverse of the current layout transformation. 
@@ -133,7 +131,7 @@ class Layout(Node): """ return _ffi_api.Layout_inverse(self) - def is_equal(self, other: Layout) -> bool: + def is_equal(self, other: 'Layout') -> bool: """ Check if the current layout is equal to another layout. diff --git a/tilelang/layout/swizzle.py b/tilelang/layout/swizzle.py index f63c954a..3a219c67 100644 --- a/tilelang/layout/swizzle.py +++ b/tilelang/layout/swizzle.py @@ -1,7 +1,7 @@ """Wrapping Layouts.""" # pylint: disable=invalid-name, unsupported-binary-operation - from __future__ import annotations + import tvm from tvm.tir import Buffer, BufferLoad, BufferRegion from tilelang import _ffi_api diff --git a/tilelang/primitives/gemm/__init__.py b/tilelang/primitives/gemm/__init__.py index ee9436d1..24843740 100644 --- a/tilelang/primitives/gemm/__init__.py +++ b/tilelang/primitives/gemm/__init__.py @@ -1,5 +1,4 @@ from __future__ import annotations - from tvm import tir from tilelang.utils import is_local, is_fragment, is_shared from tilelang.primitives.gemm.base import GemmWarpPolicy diff --git a/tilelang/profiler/__init__.py b/tilelang/profiler/__init__.py index c681ee97..3ff2baab 100644 --- a/tilelang/profiler/__init__.py +++ b/tilelang/profiler/__init__.py @@ -1,6 +1,5 @@ """The profiler and convert to torch utils""" from __future__ import annotations - from typing import Callable, Any, Literal from functools import partial import torch diff --git a/tilelang/quantize/lop3.py b/tilelang/quantize/lop3.py index 47d91f05..e4e7f7ee 100644 --- a/tilelang/quantize/lop3.py +++ b/tilelang/quantize/lop3.py @@ -1,6 +1,5 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-from __future__ import annotations from typing import Literal decode_i4_to_f16 = """ diff --git a/tilelang/quantize/mxfp.py b/tilelang/quantize/mxfp.py index 0425c549..80f3e061 100644 --- a/tilelang/quantize/mxfp.py +++ b/tilelang/quantize/mxfp.py @@ -1,4 +1,3 @@ -from __future__ import annotations from typing import Literal # Implementation asm for fp4 to bf16, using twiddling diff --git a/tilelang/transform/add_bufstore_wrapper.py b/tilelang/transform/add_bufstore_wrapper.py index 7ccab470..d8457f99 100644 --- a/tilelang/transform/add_bufstore_wrapper.py +++ b/tilelang/transform/add_bufstore_wrapper.py @@ -1,4 +1,3 @@ -from __future__ import annotations from tvm.tir import (BufferStore, For, AttrStmt, ForKind, Var, PrimFunc, BufferLoad, Buffer, IntImm) from tvm.tir.stmt_functor import ir_transform, post_order_visit from tvm.tir.transform import prim_func_pass diff --git a/tilelang/utils/tensor.py b/tilelang/utils/tensor.py index 51f63db4..79947750 100644 --- a/tilelang/utils/tensor.py +++ b/tilelang/utils/tensor.py @@ -1,4 +1,3 @@ -from __future__ import annotations """The profiler and convert to torch utils""" from enum import Enum import torch -- GitLab From e805f8e5a96a0c63342bdf0420941737dcbdc469 Mon Sep 17 00:00:00 2001 From: Chaofan Lin Date: Tue, 18 Nov 2025 14:06:31 +0800 Subject: [PATCH 013/139] [BugFix] Adding extra parameters into autotune hashkey (#1274) * [BugFix] Adding extra parameters into autotune hashkey * lint * None check * check serializable --- tilelang/autotuner/tuner.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tilelang/autotuner/tuner.py b/tilelang/autotuner/tuner.py index 4027c619..7138f4c1 100644 --- a/tilelang/autotuner/tuner.py +++ b/tilelang/autotuner/tuner.py @@ -235,7 +235,8 @@ class AutoTuner: self._kernel_parameters = k_parameters self._function_parameters = f_parameters - def generate_cache_key(self, parameters: dict[str, Any]) -> AutotuneResult | None: + def 
generate_cache_key(self, parameters: dict[str, Any], + extra_parameters: dict[str, Any]) -> AutotuneResult | None: """Generate a cache key for the auto-tuning process. """ @@ -261,6 +262,7 @@ class AutoTuner: key_data = { "version": __version__, "op_parameters": tuple(op_parameters), + "extra_parameters": extra_parameters, "func_source": func_source, "configs": self.configs, "compile_args": hash(self.compile_args), @@ -293,10 +295,28 @@ class AutoTuner: sig = inspect.signature(self.fn) parameters = sig.parameters + # NOTE(chaofan): We need to extract some parameters from the closure. + # Consider the case: + # def gemm(M, N, K): + # def kernel(...) + # If we only extract source, M/N/K will be symbolic and there will be cache problem. + extra_parameters: dict[str, Any] = {} + cells = self.fn.__closure__ + var_names = self.fn.__code__.co_freevars + if cells is not None: + assert len(var_names) == len(cells), "Number of free variables does not match" + for var_name, cell in zip(var_names, cells): + if var_name in parameters: + continue + # Cell content must be serializable + assert isinstance(cell.cell_contents, (int, float, str, bool, type(None))), \ + f"Cell contents {cell.cell_contents} is not serializable: {type(cell.cell_contents)}" + extra_parameters[var_name] = cell.cell_contents + if isinstance(self.configs, Callable): self.configs = self.configs(*self._kernel_parameters) - key = self.generate_cache_key(parameters) + key = self.generate_cache_key(parameters, extra_parameters) with self._lock: if env.is_cache_enabled(): -- GitLab From 49c857154efdf9edf509c8ab1fb0c967724470b8 Mon Sep 17 00:00:00 2001 From: Elevator14B Date: Tue, 18 Nov 2025 15:28:23 +0800 Subject: [PATCH 014/139] Fix various issues under `int64_t` static and dynamic shape. (#1218) * Fix various issues under int64_t static and dynamic shape. * Resolve reviewed issues. * Add unit test. 
* fix --------- Co-authored-by: LeiWang1999 --- src/transform/inject_assumes.cc | 4 +- .../language/test_tilelang_language_int64.py | 66 +++++++++++++++++++ .../jit/adapter/cython/cython_wrapper.pyx | 4 +- tilelang/jit/adapter/nvrtc/wrapper.py | 4 +- tilelang/jit/adapter/wrapper.py | 28 ++++---- 5 files changed, 88 insertions(+), 18 deletions(-) create mode 100644 testing/python/language/test_tilelang_language_int64.py diff --git a/src/transform/inject_assumes.cc b/src/transform/inject_assumes.cc index 485e270c..3c3bf923 100644 --- a/src/transform/inject_assumes.cc +++ b/src/transform/inject_assumes.cc @@ -6,6 +6,7 @@ #include "tvm/node/structural_hash.h" #include "tvm/tir/builtin.h" #include "tvm/tir/expr.h" +#include "tvm/tir/op.h" #include "tvm/tir/stmt.h" #include "tvm/tir/stmt_functor.h" #include "tvm/tir/transform.h" @@ -62,7 +63,8 @@ private: Stmt build(Stmt body) { auto analyzer = arith::Analyzer{}; for (const auto &e : items) { - auto simplified = analyzer.Simplify(GT(e.expr, 0)); + auto simplified = + analyzer.Simplify(GT(e.expr, make_zero(e.expr->dtype))); std::stringstream ss; ss << "Buffer shape should be greater than 0: shape `" << e.expr << "` from buffer "; diff --git a/testing/python/language/test_tilelang_language_int64.py b/testing/python/language/test_tilelang_language_int64.py new file mode 100644 index 00000000..28fa2211 --- /dev/null +++ b/testing/python/language/test_tilelang_language_int64.py @@ -0,0 +1,66 @@ +import tilelang +import tilelang.language as T + + +@tilelang.jit +def fill_symbolic(value: float, dtype="bfloat16"): + n = T.symbolic("n", "int64") + block_n = 512 + + @T.prim_func + def main(x: T.Tensor[n, dtype]): + # Initialize Kernel Context + with T.Kernel(T.ceildiv(n, block_n), threads=128) as bx: + # Doesn't yet work with int64-shaped global tensor + # T.fill(x[bx * block_n : (bx + 1) * block_n], value) + for i in T.Parallel(block_n): + x[bx * block_n + i] = value + + return main + + +def run_fill_symbolic(n: int): + import 
torch + + x = torch.zeros(n, dtype=torch.bfloat16, device="cuda") + fill_symbolic(1.0)(x) + assert x.min() == 1.0 and x.max() == 1.0 + + +def test_fill_symbolic(): + # Requires 8GB VRAM + run_fill_symbolic(2**32) + + +@tilelang.jit +def fill_static(n: int, value: float, dtype="bfloat16"): + block_n = 512 + + @T.prim_func + def main(x: T.Tensor[n, dtype]): + # Initialize Kernel Context + with T.Kernel(T.ceildiv(n, block_n), threads=128) as bx: + # Doesn't yet work with int64-shaped global tensor + # T.fill(x[bx * block_n : (bx + 1) * block_n], value) + for i in T.Parallel(block_n): + x[bx * block_n + i] = value + + return main + + +def run_fill_static(n: int): + import torch + + x = torch.zeros(n, dtype=torch.bfloat16, device="cuda") + fill_static(n, 1.0)(x) + assert x.min() == 1.0 and x.max() == 1.0 + + +def test_fill_static(): + # Requires 8GB VRAM + run_fill_static(2**32) + + +if __name__ == "__main__": + test_fill_symbolic() + test_fill_static() diff --git a/tilelang/jit/adapter/cython/cython_wrapper.pyx b/tilelang/jit/adapter/cython/cython_wrapper.pyx index f17bfffc..873e5507 100644 --- a/tilelang/jit/adapter/cython/cython_wrapper.pyx +++ b/tilelang/jit/adapter/cython/cython_wrapper.pyx @@ -267,9 +267,9 @@ cdef class CythonKernelWrapper: # Add dynamic dimension values to kernel arguments for _, (ref_id, buffer_idx, shape_idx) in self.dynamic_symbolic_map.items(): if ref_id == 0: - call_args.append(tensor_list[buffer_idx].shape[shape_idx]) + call_args.append(ctypes.c_int64(tensor_list[buffer_idx].shape[shape_idx])) else: - call_args.append(tensor_list[buffer_idx].stride(shape_idx)) + call_args.append(ctypes.c_int64(tensor_list[buffer_idx].stride(shape_idx))) # Add CUDA stream to kernel arguments call_args.append(ctypes.c_void_p(stream)) diff --git a/tilelang/jit/adapter/nvrtc/wrapper.py b/tilelang/jit/adapter/nvrtc/wrapper.py index 1a29adef..7e00050c 100644 --- a/tilelang/jit/adapter/nvrtc/wrapper.py +++ b/tilelang/jit/adapter/nvrtc/wrapper.py @@ -313,9 +313,9 
@@ class TLNVRTCSourceWrapper(TLCUDASourceWrapper): raise ValueError( f"Parameter {param} is not in the buffer map of the primary function.") # Add dynamic symbols as integer arguments - for dyn_sym in dynamic_symbolic_set: + for dyn_sym, dyn_sym_dtype in dynamic_symbolic_set: if dyn_sym not in [arg["name"] for arg in function_args]: - function_args.append({"name": dyn_sym, "type": "ctypes.c_int"}) + function_args.append({"name": dyn_sym, "type": self._lookup_type(dyn_sym_dtype)}) function_args.append(self.get_stream_type()) diff --git a/tilelang/jit/adapter/wrapper.py b/tilelang/jit/adapter/wrapper.py index 7819890d..48b8e908 100644 --- a/tilelang/jit/adapter/wrapper.py +++ b/tilelang/jit/adapter/wrapper.py @@ -220,9 +220,9 @@ class TLCUDASourceWrapper: raise ValueError( f"Parameter {param} is not in the buffer map of the primary function.") # Add dynamic symbols as integer arguments - for dyn_sym in dynamic_symbolic_set: + for dyn_sym, dyn_sym_dtype in dynamic_symbolic_set: if dyn_sym not in [arg["name"] for arg in function_args]: - function_args.append({"name": dyn_sym, "type": "int"}) + function_args.append({"name": dyn_sym, "type": self._lookup_type(dyn_sym_dtype)}) function_args.append(self.get_stream_type()) @@ -405,18 +405,20 @@ class TLCUDASourceWrapper: def get_dynamic_symbolic_set(self, prim_func): # Determine the set of dynamic symbols used in the function - dynamic_symbolic_set: list[str] = [] + dynamic_symbolic_set: dict[str, str] = {} - def unique_push_back(name: str): + def unique_push_back(name: str, dtype: str): if name not in dynamic_symbolic_set: - dynamic_symbolic_set.append(name) + dynamic_symbolic_set[name] = dtype + else: + assert dtype == dynamic_symbolic_set[name] for param in prim_func.params: if param in prim_func.buffer_map: buffer = prim_func.buffer_map[param] for dim in buffer.shape: if isinstance(dim, tvm.tir.Var): - unique_push_back(dim.name) + unique_push_back(dim.name, str(dim.dtype)) # Note: In buffer definitions, any dynamic 
symbols appearing in strides are listed after those in the shape. for param in prim_func.params: @@ -424,9 +426,9 @@ class TLCUDASourceWrapper: buffer = prim_func.buffer_map[param] for stride in buffer.strides: if isinstance(stride, tvm.tir.Var): - unique_push_back(stride.name) + unique_push_back(stride.name, str(stride.dtype)) - return dynamic_symbolic_set + return list(dynamic_symbolic_set.items()) def get_init_func(self): # Initialize an empty string for the CUDA function call @@ -665,8 +667,8 @@ class TLCPUSourceWrapper: raise ValueError( f"Parameter {param} is not in the buffer map of the primary function.") # Add dynamic symbols as integer arguments - for dyn_sym in dynamic_symbolic_set: - function_args.append({"name": dyn_sym, "type": "int"}) + for dyn_sym, dyn_sym_dtype in dynamic_symbolic_set: + function_args.append({"name": dyn_sym, "type": self._lookup_type(dyn_sym_dtype)}) # Format the function arguments for declaration def_args = ", ".join([f"{arg['type']} {arg['name']}" for arg in function_args]) @@ -715,14 +717,14 @@ class TLCPUSourceWrapper: def get_dynamic_symbolic_set(self, prim_func): # Determine the set of dynamic symbols used in the function - dynamic_symbolic_set: list[str] = [] + dynamic_symbolic_set: dict[str, str] = {} for param in prim_func.params: if param in prim_func.buffer_map: buffer = prim_func.buffer_map[param] for dim in buffer.shape: if isinstance(dim, tvm.tir.Var) and (dim.name not in dynamic_symbolic_set): - dynamic_symbolic_set.append(dim.name) - return dynamic_symbolic_set + dynamic_symbolic_set[dim.name] = str(dim.dtype) + return list(dynamic_symbolic_set.items()) def get_cpu_init_func(self): # Provide init() and get_last_error() for CPU backend -- GitLab From 0f980f15c575bf35db73a70fc04a8a53c005b2c8 Mon Sep 17 00:00:00 2001 From: Jay Zhuang <80731350+learning-chip@users.noreply.github.com> Date: Tue, 18 Nov 2025 14:35:18 +0100 Subject: [PATCH 015/139] Bug fix for Gated Delta Net benchmark script (#1267) * fix argument order 
for fla chunk_gated_delta_rule_fwd_h * explicit import assert_similar from utils * rename utils module to avoid name clash * set store_final_state and save_new_value to True * fix --------- Co-authored-by: LeiWang1999 --- examples/gdn/example_chunk_delta_bwd.py | 2 +- examples/gdn/example_chunk_delta_h.py | 30 +++++++++++++++++------ examples/gdn/example_chunk_o_bwd.py | 2 +- examples/gdn/example_wy_fast_bwd_split.py | 2 +- examples/gdn/{utils.py => test_utils.py} | 0 5 files changed, 25 insertions(+), 11 deletions(-) rename examples/gdn/{utils.py => test_utils.py} (100%) diff --git a/examples/gdn/example_chunk_delta_bwd.py b/examples/gdn/example_chunk_delta_bwd.py index 518b0ee2..d9ccc256 100644 --- a/examples/gdn/example_chunk_delta_bwd.py +++ b/examples/gdn/example_chunk_delta_bwd.py @@ -24,7 +24,7 @@ import torch.nn.functional as F torch.random.manual_seed(0) # torch.set_printoptions(profile="full") -from utils import * +from test_utils import assert_similar def prepare_input( diff --git a/examples/gdn/example_chunk_delta_h.py b/examples/gdn/example_chunk_delta_h.py index 61c2abd3..cc384ade 100644 --- a/examples/gdn/example_chunk_delta_h.py +++ b/examples/gdn/example_chunk_delta_h.py @@ -20,7 +20,7 @@ import torch import torch.nn.functional as F from tilelang.engine.callback import register_cuda_postproc_callback # noqa: F401 -from utils import * +from test_utils import assert_similar # (zhengju) We can slightly modify the generated cuda code from tilelang lowering # in the debug folder to make the performance better. 
To enable this callback, @@ -292,9 +292,15 @@ def run_test( getattr(torch, state_dtype)) # fla ref - h_ref, V_new_ref, final_state_ref = chunk_gated_delta_rule_fwd_h(K, W, U, G, initial_state, - store_final_state, chunk_size, - save_new_value) + h_ref, V_new_ref, final_state_ref = chunk_gated_delta_rule_fwd_h( + k=K, + w=W, + u=U, + g=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value) # tilelang kernel = tilelang_chunk_gated_delta_rule_fwd_h(B, S, H, DK, DV, input_dtype, output_dtype, @@ -305,8 +311,16 @@ def run_test( # (zhengju) If you want to print the generated cuda code, you can uncomment the following line # print("CUDA Code:\n", kernel.get_kernel_source()) - fla_time = do_bench(chunk_gated_delta_rule_fwd_h, K, W, U, G, initial_state, store_final_state, - chunk_size, save_new_value) + fla_time = do_bench( + chunk_gated_delta_rule_fwd_h, + k=K, + w=W, + u=U, + g=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value) tilelang_time = do_bench(kernel, K, W, U, G, initial_state) # check correctness @@ -371,8 +385,8 @@ def main(): chunk_size=64, use_g=True, use_initial_state=False, - store_final_state=False, - save_new_value=False, + store_final_state=True, + save_new_value=True, block_DK=32, block_DV=32, threads=128, diff --git a/examples/gdn/example_chunk_o_bwd.py b/examples/gdn/example_chunk_o_bwd.py index 7e87a2c4..ff4d3f7a 100644 --- a/examples/gdn/example_chunk_o_bwd.py +++ b/examples/gdn/example_chunk_o_bwd.py @@ -19,7 +19,7 @@ except ImportError: fla = None import torch -from utils import * +from test_utils import assert_similar torch.random.manual_seed(0) # torch.set_printoptions(profile="full") diff --git a/examples/gdn/example_wy_fast_bwd_split.py b/examples/gdn/example_wy_fast_bwd_split.py index 618a82b4..42a0040d 100644 --- a/examples/gdn/example_wy_fast_bwd_split.py +++ 
b/examples/gdn/example_wy_fast_bwd_split.py @@ -501,7 +501,7 @@ def run_test( dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum( dim=-1) - from utils import assert_similar + from test_utils import assert_similar assert_similar(dk_ref, dk_tilelang, eps=1e-5, name="dk", raise_assert=False) assert_similar(dv_ref, dv_tilelang, eps=1e-5, name="dv", raise_assert=False) assert_similar(dbeta_ref, dbeta_tilelang, eps=1e-5, name="dbeta", raise_assert=False) diff --git a/examples/gdn/utils.py b/examples/gdn/test_utils.py similarity index 100% rename from examples/gdn/utils.py rename to examples/gdn/test_utils.py -- GitLab From 1b0efb650fd0dfd05d0b643bf5eaa8e9781239ee Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Tue, 18 Nov 2025 21:37:01 +0800 Subject: [PATCH 016/139] [Bugfix] Minor fix for some cases (#1278) --- .../gemm_v2/correctness_evaluation_tcgen05.py | 25 ++++++++----------- .../intrinsics/tcgen05_macro_generator.py | 5 ++-- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/maint/gemm_v2/correctness_evaluation_tcgen05.py b/maint/gemm_v2/correctness_evaluation_tcgen05.py index f5d76589..1831ac8a 100644 --- a/maint/gemm_v2/correctness_evaluation_tcgen05.py +++ b/maint/gemm_v2/correctness_evaluation_tcgen05.py @@ -191,7 +191,7 @@ def test_gemm_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): if __name__ == "__main__": - # tilelang.testing.main() + tilelang.testing.main() # # Test Pass # for m in [32, 64, 128, 256]: @@ -203,6 +203,16 @@ if __name__ == "__main__": # run_gemm(m, n, k * 3, False, True, "float16", "float", "float", m, n, k, 2, 128) # print(f"Test {m} {n} {k} Pass") + # # Test Pass + # for m in [32, 64, 128, 256]: + # for n in [32, 64, 128]: + # for k in [16, 32, 64, 128]: + # if m in [32, 64] and (n not in [64, 128, 256]): + # continue + # print(f"======================= Test {m} {n} {k} False True =============================") + # run_gemm(m, n, k 
* 3, False, True, "float16", "float", "float", m, n, k, 2, 256) + # print(f"Test {m} {n} {k} Pass") + # # Test Pass # for m in [32, 64, 128, 256]: # for n in [16, 32, 64, 128]: @@ -211,16 +221,3 @@ if __name__ == "__main__": # continue # print(f"======================= Test {m} {n} {k} False True =============================") # run_gemm(m, n, k * 3, False, True, "float8_e5m2", "float", "float", m, n, k, 2, 128) - # print(f"Test {m} {n} {k} Pass") - - tilelang.disable_cache() - run_gemm(32, 512, 16, False, True, "float16", "float32", "float32", 32, 512, 16, 0, 128) - run_gemm(32, 512, 32, False, True, "float16", "float32", "float32", 32, 512, 32, 0, 128) - run_gemm(32, 512, 64, False, True, "float16", "float32", "float32", 32, 512, 64, 0, 128) - run_gemm(64, 512, 16, False, True, "float16", "float32", "float32", 64, 512, 16, 0, 128) - run_gemm(64, 512, 16, False, True, "float16", "float32", "float32", 32, 512, 16, 0, 128) - run_gemm(128, 512, 16, False, True, "float16", "float32", "float32", 128, 512, 16, 0, 128) - - # run_gemm(64, 512, 32, False, True, "float16", "float32", "float32", 64, 512, 32, 0, 128) - # run_gemm(64, 512, 64, False, True, "float16", "float32", "float32", 64, 512, 64, 0, 128) - # run_gemm(128, 512, 16, False, True, "float16", "float32", "float32", 128, 512, 16, 0, 128) diff --git a/tilelang/intrinsics/tcgen05_macro_generator.py b/tilelang/intrinsics/tcgen05_macro_generator.py index 814d28b6..e53ff7cb 100644 --- a/tilelang/intrinsics/tcgen05_macro_generator.py +++ b/tilelang/intrinsics/tcgen05_macro_generator.py @@ -247,8 +247,9 @@ class TensorCoreIntrinEmitter(MMAIntrinEmitter): mask_zero = T.Cast("int32", 0) mask0 = mask1 = mask2 = mask3 = mask_zero - num_inst_m = 4 * self.warp_row_tiles // atom_m - num_inst_n = self.warp_col_tiles // atom_n + # TCGEN05 only has one warp group + num_inst_m = self.block_row_warps * self.warp_row_tiles // atom_m + num_inst_n = self.block_col_warps * self.warp_col_tiles // atom_n # Helper to allow 
BufferRegion/BufferLoad as inputs def access_ptr_from(buffer_or_load_or_region, access_type: str = "r"): -- GitLab From 921b96a31bb10e7aff84dece6e7501cf1fb96c63 Mon Sep 17 00:00:00 2001 From: Chaofan Lin Date: Tue, 18 Nov 2025 23:17:49 +0800 Subject: [PATCH 017/139] [Language] Add shape check in `T.view/reshape` (#1277) * [Language] Add shape check in T.view/reshape * address comments --- .../test_tilelang_language_reshape.py | 21 +++++++++++++ .../language/test_tilelang_language_view.py | 31 +++++++++++++++++++ tilelang/language/customize.py | 12 ++++--- tilelang/utils/language.py | 13 +++++++- 4 files changed, 72 insertions(+), 5 deletions(-) diff --git a/testing/python/language/test_tilelang_language_reshape.py b/testing/python/language/test_tilelang_language_reshape.py index c510bdd3..60588b4a 100644 --- a/testing/python/language/test_tilelang_language_reshape.py +++ b/testing/python/language/test_tilelang_language_reshape.py @@ -2,6 +2,7 @@ from tilelang import tvm as tvm import tilelang.testing import tilelang as tl import torch +import pytest def reshape_test(N, M, dtype): @@ -262,5 +263,25 @@ def test_reduce_after_reshape(): run_reduce_after_reshape(2048, 64, "float16") +def reshape_shape_mismatch_test(N, M, dtype): + import tilelang.language as T + + @T.prim_func + def main( + A: T.Tensor((N,), dtype), + B: T.Tensor((N // M, M), dtype), + ): + with T.Kernel(1) as _: + A_reshaped = T.reshape(A, [N // M, M + 1]) + T.copy(A_reshaped, B) + + return main + + +def test_reshape_shape_mismatch(): + with pytest.raises(AssertionError): + reshape_shape_mismatch_test(1024, 32, "float32") + + if __name__ == "__main__": tilelang.testing.main() diff --git a/testing/python/language/test_tilelang_language_view.py b/testing/python/language/test_tilelang_language_view.py index c16c5185..a79d428b 100644 --- a/testing/python/language/test_tilelang_language_view.py +++ b/testing/python/language/test_tilelang_language_view.py @@ -1,6 +1,7 @@ from tilelang import tvm as tvm 
import tilelang.testing import tilelang as tl +import pytest def view_test(N, M, dtype, new_dtype=None): @@ -54,5 +55,35 @@ def test_reshape_view(): run_view(2048, 64, "float16", "float32") +def view_shape_mismatch_test(N, M, dtype, new_dtype=None): + import tilelang.language as T + + new_shape = [N // M, M + 1] + if new_dtype: + from tvm import DataType + dtype_src = DataType(dtype) + dtype_dst = DataType(new_dtype) + src_bits = dtype_src.bits + dst_bits = dtype_dst.bits + scale = src_bits / dst_bits + new_shape[-1] = int(M * scale) + + @T.prim_func + def main( + A: T.Tensor((N,), dtype), + B: T.Tensor(new_shape, new_dtype if new_dtype else dtype), + ): + with T.Kernel(1) as _: + A_viewed = T.view(A, new_shape, dtype=new_dtype) + T.copy(A_viewed, B) + + return main + + +def test_view_shape_mismatch(): + with pytest.raises(AssertionError): + view_shape_mismatch_test(1024, 32, "float32") + + if __name__ == "__main__": tilelang.testing.main() diff --git a/tilelang/language/customize.py b/tilelang/language/customize.py index 9175bdb8..3d40ce47 100644 --- a/tilelang/language/customize.py +++ b/tilelang/language/customize.py @@ -2,6 +2,7 @@ from __future__ import annotations import tilelang.language as T from tvm.tir import PrimExpr, Buffer, op +from tilelang.utils.language import (bits_product, prim_expr_equal) from .atomic import atomic_max, atomic_min, atomic_add, atomic_addx2, atomic_addx4, atomic_load, atomic_store # noqa: F401 @@ -45,19 +46,22 @@ def reshape(src: Buffer, shape: list[PrimExpr]) -> Buffer: Returns: Buffer: A new buffer view with the specified shape """ + assert prim_expr_equal(bits_product(shape, src.dtype), + bits_product(src.shape, src.dtype)), "T.reshape/view shape check failed." return T.Tensor(shape, src.dtype, src.data) def view(src: Buffer, shape: list[PrimExpr] | None = None, dtype: str | None = None) -> Buffer: - """ - Return a Tensor view of the input buffer with an optional new shape and dtype. 
+ """Return a Tensor view of the input buffer with an optional new shape and dtype. - If `shape` is None the source buffer's shape is used; if `dtype` is None the source buffer's dtype is used. The returned buffer shares the same underlying data as `src` (no copy). - """ + If `shape` is None the source buffer's shape is used; if `dtype` is None the source buffer's dtype is used. The returned buffer shares the same underlying data as `src` (no copy). + """ if shape is None: shape = src.shape if dtype is None: dtype = src.dtype + assert prim_expr_equal(bits_product(shape, dtype), + bits_product(src.shape, src.dtype)), "T.reshape/view shape check failed." return T.Tensor(shape, dtype, src.data) diff --git a/tilelang/utils/language.py b/tilelang/utils/language.py index de180745..e9fe13da 100644 --- a/tilelang/utils/language.py +++ b/tilelang/utils/language.py @@ -1,7 +1,7 @@ from __future__ import annotations from tvm.tir import Buffer, BufferLoad, BufferRegion, PrimExpr from functools import reduce -from tvm import IRModule +from tvm import IRModule, DataType from tvm.tir import PrimFunc from tvm import ir, tir @@ -349,6 +349,17 @@ def retrieve_offset(obj: Buffer | BufferRegion | BufferLoad) -> list: raise ValueError(f"Unsupported retrieve_offset argument type: {type(obj)} for object {obj}") +def bits_product(shape: list[PrimExpr], dtype: str) -> PrimExpr: + """ + Compute the number of bits in a Buffer (shape with dtype).""" + if len(shape) == 0: + return tir.IntImm("int32", 1) + result = shape[0] + for i in range(1, len(shape)): + result = result * shape[i] + return result * DataType(dtype).bits + + def prim_expr_equal(lhs, rhs) -> bool: """ Robust equality for PrimExpr shapes/extents. 
-- GitLab From 74da369695068da9ddef76dc807792abcea0f6fa Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Tue, 18 Nov 2025 23:50:57 +0800 Subject: [PATCH 018/139] [FFI] Use tvm ffi as the default execution backend (#1259) * [Refactor] Update FFI type handling and simplify argument management * Refactored FFI type definitions in runtime and code generation files to use `TVMFFIAny` instead of `TVMValue`, enhancing type clarity. * Updated function registration in `runtime.cc` to utilize canonical names for better consistency. * Simplified argument handling in the `simplify` transformation, ensuring unused buffer parameters are removed only when simplification is enabled. * Adjusted autotuner and profiler parameters to standardize the execution backend to `tvm_ffi`, improving clarity in backend selection. * Removed obsolete `adapt_torch2tvm` function from tensor utilities to streamline the codebase and reduce complexity. * [Update] Sync TVM submodule and enhance kernel source handling * Updated the TVM submodule to commit cdc2aced, ensuring compatibility with recent changes. * Added functionality to print kernel source in `example_blocksparse_gemm.py` for better debugging. * Commented out the main execution call in test files to prevent unintended execution during testing. * Introduced `tilelang.disable_cache()` in various test files to streamline testing and avoid cache-related issues. * Refactored kernel source retrieval methods to improve clarity and consistency across different execution backends. * [Refactor] Clean up imports and improve code formatting * Removed unused import of `tilelang.testing` in `test_example_blocksparse_gemm.py` to streamline the code. * Reformatted several lines in `arg_binder.cc`, `make_packed_api.cc`, `tvm_ffi.py`, and `adapter.py` for improved readability and consistency. * Updated comments and spacing in `tvm_ffi.py` to enhance clarity without altering functionality. 
* Update execution backend options and improve resolution logic - Changed default execution backend from "cython" to "auto" in multiple locations to allow automatic selection based on the target. - Expanded the list of supported execution backends to include "torch" and "nvrtc" across various classes and functions. - Enhanced backend resolution logic in `KernelCache` and `AutoTuner` to ensure appropriate backend selection based on the target. - Updated documentation to reflect changes in execution backend options and their defaults. * lint fix * fix * Enhance argument handling in CUDA and HIP runtime modules - Updated `ExtractFuncInfo` in `rt_mod_cuda.cc` and `rt_mod_hip.cc` to map boolean argument types to int32, ensuring compatibility with device runtime. - Refactored `BindDLTensor` in `arg_binder.cc` to improve null handling and validation checks for DLTensor parameters, utilizing expression-level guards to prevent dereferencing null pointers. - Enhanced error checking for buffer shape, strides, and data fields, ensuring robust handling of optional inputs and maintaining consistency across various checks. * lint fix * lint fix * lint fix * lint fix * minor fix * fix * recover check * Refactor argument binding and validation in `arg_binder.cc` - Improved null handling and validation checks in `BindDLTensor`, ensuring safe dereferencing of pointers. - Enhanced consistency checks for buffer shape, strides, and data fields, utilizing expression-level guards. - Updated `MakePackedAPI` to maintain code clarity and consistency in argument handling. - Minor adjustments in test files to streamline kernel execution and improve readability. * lint fix * stride fix * minor fix * fix * lint fix * lint fix * Add CUDA stream access policy window helpers and integrate with L2 persistent cache management - Introduced functions to set and reset the CUDA stream access policy window, allowing for better control over L2 cache usage. 
- Updated runtime files to include new FFI packed functions for managing stream attributes. - Modified lower_hopper_intrin to incorporate prologue and epilogue statements for L2 cache setup and teardown. - Enhanced tests to verify the inclusion of new FFI calls in the generated kernel source. * check with symbolic * support null ptr * Update CMakeLists and lower.py for code generation and subproject status - Added `codegen_c_host.cc` to the list of source files in CMakeLists.txt for improved code generation support. - Updated the function call in `lower.py` to use `target.build.tilelang_c` for C target host code generation, enhancing compatibility. - Marked the TVM subproject as dirty to indicate local modifications. * lint fix * Update comments for clarity in quickstart.py --- 3rdparty/tvm | 2 +- CMakeLists.txt | 1 + .../example_blocksparse_gemm.py | 1 - examples/gdn/example_chunk_o_bwd.py | 1 - examples/gdn/test_example_gdn_compilation.py | 1 + examples/quickstart.py | 5 +- pyproject.toml | 1 + src/runtime/runtime.cc | 172 ++++- src/runtime/runtime.h | 8 +- src/target/codegen_c_host.cc | 556 +++++++++++++++++ src/target/codegen_c_host.h | 124 ++++ src/target/codegen_cpp.cc | 8 +- src/target/rt_mod_cuda.cc | 6 +- src/target/rt_mod_hip.cc | 6 +- src/transform/arg_binder.cc | 384 +++++++++--- src/transform/arg_binder.h | 4 + src/transform/lower_hopper_intrin.cc | 64 +- src/transform/make_packed_api.cc | 293 ++++----- src/transform/simplify.cc | 57 +- .../python/debug/test_tilelang_debug_print.py | 2 +- .../dynamic/test_tilelang_dynamic_symbolic.py | 3 +- .../jit/test_tilelang_jit_gemm_ctypes.py | 411 ------------ .../python/jit/test_tilelang_jit_nullptr.py | 13 +- .../python/jit/test_tilelang_jit_tvm_ffi.py | 589 ++++++++++++++++++ .../language/test_tilelang_language_alloc.py | 4 +- tilelang/autotuner/param.py | 6 +- tilelang/autotuner/tuner.py | 21 +- tilelang/cache/__init__.py | 3 +- tilelang/cache/kernel_cache.py | 145 +++-- tilelang/contrib/dlpack.py | 20 - 
tilelang/engine/lower.py | 2 +- tilelang/jit/__init__.py | 45 +- tilelang/jit/adapter/__init__.py | 2 +- tilelang/jit/adapter/base.py | 48 +- tilelang/jit/adapter/ctypes/adapter.py | 25 +- tilelang/jit/adapter/cython/adapter.py | 26 +- tilelang/jit/adapter/dlpack.py | 40 -- tilelang/jit/adapter/nvrtc/adapter.py | 21 +- tilelang/jit/adapter/tvm_ffi.py | 321 ++++++++++ tilelang/jit/execution_backend.py | 100 +++ tilelang/jit/kernel.py | 85 ++- tilelang/profiler/__init__.py | 4 +- tilelang/utils/tensor.py | 19 - 43 files changed, 2721 insertions(+), 928 deletions(-) create mode 100644 src/target/codegen_c_host.cc create mode 100644 src/target/codegen_c_host.h delete mode 100644 testing/python/jit/test_tilelang_jit_gemm_ctypes.py create mode 100644 testing/python/jit/test_tilelang_jit_tvm_ffi.py delete mode 100644 tilelang/jit/adapter/dlpack.py create mode 100644 tilelang/jit/adapter/tvm_ffi.py create mode 100644 tilelang/jit/execution_backend.py diff --git a/3rdparty/tvm b/3rdparty/tvm index 093b2cdb..f4105f89 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 093b2cdb2187140b197336496d65d61ace89e8ff +Subproject commit f4105f89a646622acc9818584d1d91e2ca3f533d diff --git a/CMakeLists.txt b/CMakeLists.txt index 72e1d979..f784f11f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,6 +138,7 @@ file(GLOB TILE_LANG_SRCS src/transform/*.cc src/op/*.cc src/target/utils.cc + src/target/codegen_c_host.cc src/target/codegen_cpp.cc src/target/rt_mod_cpp.cc # intrin_rule doesn't have system dependency diff --git a/examples/blocksparse_gemm/example_blocksparse_gemm.py b/examples/blocksparse_gemm/example_blocksparse_gemm.py index 7b9cff7c..8cd3a821 100644 --- a/examples/blocksparse_gemm/example_blocksparse_gemm.py +++ b/examples/blocksparse_gemm/example_blocksparse_gemm.py @@ -166,7 +166,6 @@ def main(): enable_rasteration=DEFAULT_ENABLE_RASTERIZATION) block_M, block_N, block_K = DEFAULT_BLOCK_M, DEFAULT_BLOCK_N, DEFAULT_BLOCK_K print(f"Using default 
kernel with block size ({block_M}, {block_N}, {block_K})") - # Create block mask with desired sparsity mask_shape = (M // block_M, N // block_N, K // block_K) block_mask = torch.rand(mask_shape).cuda() > sparsity diff --git a/examples/gdn/example_chunk_o_bwd.py b/examples/gdn/example_chunk_o_bwd.py index ff4d3f7a..20aa8414 100644 --- a/examples/gdn/example_chunk_o_bwd.py +++ b/examples/gdn/example_chunk_o_bwd.py @@ -468,7 +468,6 @@ def run_test( kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, gate_dtype, state_dtype, chunk_size, scale, use_g, use_dw, block_DK, block_DV, threads, num_stages) - print(kernel.get_kernel_source()) dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, W) if use_g: diff --git a/examples/gdn/test_example_gdn_compilation.py b/examples/gdn/test_example_gdn_compilation.py index e184dbca..75a62171 100644 --- a/examples/gdn/test_example_gdn_compilation.py +++ b/examples/gdn/test_example_gdn_compilation.py @@ -117,6 +117,7 @@ def test_example_chunk_o_bwd_compilation(): kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, gate_dtype, state_dtype, chunk_size, 1.0, use_g, True, block_DK, block_DV, threads, num_stages) + dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, W) # noqa: F841 if use_g: diff --git a/examples/quickstart.py b/examples/quickstart.py index 42514ee3..46a39e0d 100644 --- a/examples/quickstart.py +++ b/examples/quickstart.py @@ -55,10 +55,9 @@ block_M = 128 block_N = 128 block_K = 32 -# 1. Define the kernel (matmul) and compile/lower it into an executable module +# Define the kernel (matmul) and compile/lower it into an executable module matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K) - -# 3. 
Test the kernel in Python with PyTorch data +# Test the kernel in Python with PyTorch data import torch # Create random input tensors on the GPU diff --git a/pyproject.toml b/pyproject.toml index 8c417d56..706cd529 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,6 +104,7 @@ tilelang = "tilelang" # TVM "tilelang/3rdparty/tvm/src" = "3rdparty/tvm/src" "tilelang/3rdparty/tvm/python" = "3rdparty/tvm/python" +"tilelang/3rdparty/tvm/include" = "3rdparty/tvm/include" "tilelang/3rdparty/tvm/version.py" = "3rdparty/tvm/version.py" # CUTLASS "tilelang/3rdparty/cutlass/include" = "3rdparty/cutlass/include" diff --git a/src/runtime/runtime.cc b/src/runtime/runtime.cc index a00786e2..b2a7127d 100644 --- a/src/runtime/runtime.cc +++ b/src/runtime/runtime.cc @@ -13,6 +13,12 @@ namespace tvm { namespace tl { +#if 1 +// Thread-local storage for restoring the L2 persisting cache limit +static thread_local size_t __tl_prev_persisting_l2_cache_size = 0; +static thread_local bool __tl_prev_persisting_l2_cache_saved = false; +#endif + #if (CUDA_MAJOR_VERSION >= 12) template static std::string ArrayToStr(const T *ptr, size_t n) { std::stringstream ss; @@ -91,19 +97,21 @@ struct TensorMapArgs { // set device api TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; - refl::GlobalDef().def_packed("tvm_tensormap_create_tiled", [](PackedArgs args, - Any *ret) { - TensorMapArgs T = TensorMapArgs::Extract(args); - CUresult result = cuTensorMapEncodeTiled( - T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim, - T.globalStride + 1, T.boxDim, T.elementStrides, T.interleave, T.swizzle, - T.l2Promotion, T.oobFill); - if (result != CUDA_SUCCESS) { - LOG_FATAL << "Failed to initialize the TMA descriptor " << result << '\n' - << T.ToDebugString(); - } - *ret = static_cast(result); - }); + // Register using the canonical names defined in runtime.h + refl::GlobalDef().def_packed( + tl::tvm_tensormap_create_tiled, [](PackedArgs args, Any *ret) { + TensorMapArgs T = 
TensorMapArgs::Extract(args); + CUresult result = cuTensorMapEncodeTiled( + T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim, + T.globalStride + 1, T.boxDim, T.elementStrides, T.interleave, + T.swizzle, T.l2Promotion, T.oobFill); + if (result != CUDA_SUCCESS) { + LOG_FATAL << "Failed to initialize the TMA descriptor " << result + << '\n' + << T.ToDebugString(); + } + *ret = static_cast(result); + }); } struct TensorMapIm2ColArgs { @@ -183,7 +191,7 @@ struct TensorMapIm2ColArgs { TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; refl::GlobalDef().def_packed( - "tvm_tensormap_create_im2col", [](PackedArgs args, Any *ret) { + tl::tvm_tensormap_create_im2col, [](PackedArgs args, Any *ret) { TensorMapIm2ColArgs T = TensorMapIm2ColArgs::Extract(args); CUresult result = cuTensorMapEncodeIm2col( T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim, @@ -201,5 +209,141 @@ TVM_FFI_STATIC_INIT_BLOCK() { #endif // (CUDA_MAJOR_VERSION >= 12) +// +// CUDA L2 Persisting Cache Access Policy Window helpers. +// Exposed as TVM FFI packed functions similar to TMA initialization. 
+// +TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + // Set stream access policy window and adjust persisting L2 cache size + // Args: + // [0]: void* base_ptr (required) + // [1]: int64 num_bytes (required) + // [2]: float hit_ratio (optional, default 0.8) + // [3]: void* stream (optional, default 0 => default stream) + // [4]: int64 l2_limit_bytes (optional, default = num_bytes) + refl::GlobalDef().def_packed( + tl::tvm_cuda_stream_set_access_policy_window, + [](PackedArgs args, Any *ret) { + ICHECK(args.size() >= 2) << "Expected at least base_ptr and num_bytes"; + + void *base_ptr = args[0].cast(); + size_t num_bytes = static_cast(args[1].cast()); + float hit_ratio = 0.8f; + if (args.size() >= 3) { + // Accept double/float + hit_ratio = static_cast(args[2].cast()); + } + CUstream stream = nullptr; + if (args.size() >= 4) { + stream = reinterpret_cast(args[3].cast()); + } + size_t l2_limit_bytes = num_bytes; + if (args.size() >= 5) { + l2_limit_bytes = static_cast(args[4].cast()); + } + + // Clamp requested limit to device capability + CUdevice device; + CUresult result = cuCtxGetDevice(&device); + if (result != CUDA_SUCCESS) { + LOG_FATAL << "Failed to get current CUDA device: " << result; + } + int max_persisting = 0; + result = cuDeviceGetAttribute( + &max_persisting, CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE, + device); + if (result != CUDA_SUCCESS) { + LOG_FATAL << "Failed to query MAX_PERSISTING_L2_CACHE_SIZE: " + << result; + } + if (max_persisting > 0 && + l2_limit_bytes > static_cast(max_persisting)) { + l2_limit_bytes = static_cast(max_persisting); + } + + // Save current limit to restore later + size_t init_persisting_l2_cache_size = 0; + result = cuCtxGetLimit(&init_persisting_l2_cache_size, + CU_LIMIT_PERSISTING_L2_CACHE_SIZE); + if (result != CUDA_SUCCESS) { + LOG_FATAL << "Failed to get current persisting L2 cache size limit: " + << result; + } + __tl_prev_persisting_l2_cache_size = init_persisting_l2_cache_size; + 
__tl_prev_persisting_l2_cache_saved = true; + + // Set new limit + result = + cuCtxSetLimit(CU_LIMIT_PERSISTING_L2_CACHE_SIZE, l2_limit_bytes); + if (result != CUDA_SUCCESS) { + LOG_FATAL << "Failed to set persisting L2 cache size limit: " + << result; + } + + // Apply access policy window to stream + CUstreamAttrValue stream_attribute; + memset(&stream_attribute, 0, sizeof(stream_attribute)); + stream_attribute.accessPolicyWindow.base_ptr = base_ptr; + stream_attribute.accessPolicyWindow.num_bytes = l2_limit_bytes; + stream_attribute.accessPolicyWindow.hitRatio = hit_ratio; + stream_attribute.accessPolicyWindow.hitProp = + CU_ACCESS_PROPERTY_PERSISTING; + stream_attribute.accessPolicyWindow.missProp = + CU_ACCESS_PROPERTY_STREAMING; + + result = cuStreamSetAttribute(stream, + CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW, + &stream_attribute); + if (result != CUDA_SUCCESS) { + LOG_FATAL << "Failed to set stream access policy window: " << result; + } + + *ret = static_cast(result); + }); + + // Reset stream access policy window and restore the previous L2 cache size + // Args: + // [0]: void* stream (optional, default 0) + refl::GlobalDef().def_packed( + tl::tvm_cuda_stream_reset_access_policy_window, + [](PackedArgs args, Any *ret) { + CUstream stream = nullptr; + if (args.size() >= 1) { + stream = reinterpret_cast(args[0].cast()); + } + + CUstreamAttrValue stream_attribute; + memset(&stream_attribute, 0, sizeof(stream_attribute)); + // num_bytes = 0 disables the access policy window on the stream + stream_attribute.accessPolicyWindow.num_bytes = 0; + + CUresult result = cuStreamSetAttribute( + stream, CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW, + &stream_attribute); + if (result != CUDA_SUCCESS) { + LOG_FATAL << "Failed to reset stream access policy window: " + << result; + } + + result = cuCtxResetPersistingL2Cache(); + if (result != CUDA_SUCCESS) { + LOG_FATAL << "Failed to reset persisting L2 cache lines: " << result; + } + + if (__tl_prev_persisting_l2_cache_saved) 
{ + result = cuCtxSetLimit(CU_LIMIT_PERSISTING_L2_CACHE_SIZE, + __tl_prev_persisting_l2_cache_size); + if (result != CUDA_SUCCESS) { + LOG_FATAL << "Failed to restore persisting L2 cache size limit: " + << result; + } + __tl_prev_persisting_l2_cache_saved = false; + } + + *ret = static_cast(result); + }); +} + } // namespace tl } // namespace tvm diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index fb9dfcfd..4b389fc0 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -16,7 +16,13 @@ constexpr const char *tvm_tensormap_create_tiled = constexpr const char *tvm_tensormap_create_im2col = "__tvm_tensormap_create_im2col"; #endif // (CUDA_MAJOR_VERSION >= 12) + +// CUDA stream access policy window helpers +constexpr const char *tvm_cuda_stream_set_access_policy_window = + "__tvm_cuda_stream_set_access_policy_window"; +constexpr const char *tvm_cuda_stream_reset_access_policy_window = + "__tvm_cuda_stream_reset_access_policy_window"; } // namespace tl } // namespace tvm -#endif // TVM_TL_RUNTIME_RUNTIME_H_ \ No newline at end of file +#endif // TVM_TL_RUNTIME_RUNTIME_H_ diff --git a/src/target/codegen_c_host.cc b/src/target/codegen_c_host.cc new file mode 100644 index 00000000..b5e74b0a --- /dev/null +++ b/src/target/codegen_c_host.cc @@ -0,0 +1,556 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file codegen_c_host.cc + */ +#include "codegen_c_host.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// For escaping strings embedded into generated C sources +#include "support/str_escape.h" + +namespace tvm { +namespace tl { + +CodeGenCHost::CodeGenCHost() { + module_name_ = name_supply_->FreshName(tvm::ffi::symbol::tvm_ffi_library_ctx); +} + +void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, + bool emit_fwd_func_decl, std::string target_str, + const std::unordered_set &devices) { + emit_asserts_ = emit_asserts; + emit_fwd_func_decl_ = emit_fwd_func_decl; + declared_globals_.clear(); + decl_stream << "// tilelang target: " << target_str << "\n"; + decl_stream << "#define TVM_EXPORTS\n"; + decl_stream << "#include \"tvm/runtime/base.h\"\n"; + decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; + decl_stream << "#include \"tvm/ffi/c_api.h\"\n"; + decl_stream << "#include \n"; + // snprintf for richer assert messages with actual values + decl_stream << "#include \n"; + decl_stream << "#include \n"; + CodeGenCHost::InitGlobalContext(); + tvm::codegen::CodeGenC::Init(output_ssa); +} + +void CodeGenCHost::InitGlobalContext() { + decl_stream << "void* " << tvm::ffi::symbol::tvm_ffi_library_ctx + << " = NULL;\n"; +} + +void CodeGenCHost::DefineModuleName() { + decl_stream << "void* " << module_name_ << " = NULL;\n"; +} + +void CodeGenCHost::AddFunction(const tvm::GlobalVar &gvar, + const tvm::tir::PrimFunc &func) { + return AddFunction(gvar, func, /*emit_fwd_func_decl=*/false); +} + +void CodeGenCHost::AddFunction(const tvm::GlobalVar &gvar, + const tvm::tir::PrimFunc &func, + bool emit_fwd_func_decl) { + auto global_symbol = + func->GetAttr(tvm::attr::kGlobalSymbol); + if (global_symbol) { + function_names_.push_back(global_symbol.value()); + } + + emit_fwd_func_decl_ = 
emit_fwd_func_decl; + tvm::codegen::CodeGenC::AddFunction(gvar, func); + if (func->HasNonzeroAttr(tvm::tir::attr::kIsEntryFunc) && !has_main_func_) { + ICHECK(global_symbol.has_value()) + << "CodeGenCHost: The entry func must have the global_symbol " + "attribute, " + << "but function " << gvar << " only has attributes " << func->attrs; + function_names_.push_back(tvm::ffi::symbol::tvm_ffi_main); + stream << "// CodegenC: NOTE: Auto-generated entry function\n"; + PrintFuncPrefix(stream); + PrintType(func->ret_type, stream); + stream << " " << tvm::ffi::symbol::tvm_ffi_main + << "(void* self, void* args,int num_args, void* result) {\n"; + stream << " return " << static_cast(global_symbol.value()) + << "(self, args, num_args, result);\n"; + stream << "}\n"; + has_main_func_ = true; + } +} + +void CodeGenCHost::GenerateForwardFunctionDeclarations( + tvm::ffi::String global_symbol, const tvm::ffi::Array &arg_types, + const tvm::Type &ret_type) { + if (!emit_fwd_func_decl_) { + return; + } + for (auto &func_already_defined : GetFunctionNames()) { + if (global_symbol == func_already_defined) { + return; + } + } + this->PrintFuncPrefix(fwd_decl_stream); + this->PrintType(ret_type, fwd_decl_stream); + fwd_decl_stream << " " << global_symbol << "("; + for (size_t i = 0; i < arg_types.size(); ++i) { + if (i > 0) { + fwd_decl_stream << ", "; + } + tvm::codegen::CodeGenSourceBase::PrintType(arg_types[i], fwd_decl_stream); + } + fwd_decl_stream << ");\n"; +} + +void CodeGenCHost::PrintFuncPrefix(std::ostream &os) { // NOLINT(*) + os << "#ifdef __cplusplus\n" + << "extern \"C\"\n" + << "#endif\n"; +} + +void CodeGenCHost::PrintType(tvm::DataType t, std::ostream &os) { // NOLINT(*) + int lanes = t.lanes(); + if (t.is_handle()) { + ICHECK_EQ(lanes, 1) << "does not support vector types"; + os << "void*"; + return; + } + if (t.is_void()) { + os << "void"; + return; + } + if (t == tvm::DataType::Bool()) { + os << "bool"; + return; + } + bool fail = false; + if (t.is_float()) { + 
switch (t.bits()) { + case 16: + os << "half"; + break; + case 32: + os << "float"; + break; + case 64: + os << "double"; + break; + default: + fail = true; + break; + } + if (!fail && lanes == 1) + return; + if (!fail && (lanes >= 2 && lanes <= 16)) { + os << lanes; + return; + } + } + if (t.is_bfloat16()) { + os << "__bf16"; + return; + } + if (t.is_int() || t.is_uint()) { + if (t.is_uint()) { + os << 'u'; + } + switch (t.bits()) { + case 8: + os << "int8_t"; + break; + case 16: + os << "int16_t"; + break; + case 32: + os << "int32_t"; + break; + case 64: + os << "int64_t"; + break; + case 1: + os << "int32_t"; + break; + default: + fail = true; + break; + } + if (!fail && lanes == 1) + return; + if (!fail && (lanes >= 2 && lanes <= 16)) { + os << lanes; + return; + } + } + LOG(FATAL) << "Cannot convert type " << t << " to C type"; +} + +void CodeGenCHost::VisitExpr_(const tvm::tir::BroadcastNode *op, + std::ostream &os) { // NOLINT(*) + std::string v = PrintExpr(op->value); + int lanes = op->dtype.lanes(); + os << "(("; + PrintType(op->dtype, os); + os << ")("; + for (int i = 0; i < lanes; ++i) { + if (i != 0) + os << ", "; + os << v; + } + os << "))"; +} + +void CodeGenCHost::PrintGetFuncFromBackend( + const std::string &func_name, const std::string &packed_func_name) { + this->PrintIndent(); + this->stream << "if (" << packed_func_name << " == NULL) {\n"; + int packed_func_if_scope = this->BeginScope(); + this->PrintIndent(); + this->stream << "if (TVMBackendGetFuncFromEnv(" << module_name_ << ", \"" + << func_name << "\"" + << ", &" << packed_func_name << ") != 0) {\n"; + int get_func_env_scope = this->BeginScope(); + this->PrintIndent(); + this->stream << "return -1;\n"; + this->EndScope(get_func_env_scope); + this->PrintIndent(); + this->stream << "}\n"; + this->EndScope(packed_func_if_scope); + this->PrintIndent(); + this->stream << "}\n"; +} + +void CodeGenCHost::PrintCallPacked(const tvm::tir::CallNode *op) { + using namespace tvm::tir; + const 
StringImmNode *func_name = op->args[0].as(); + ICHECK(func_name != nullptr) + << "tvm_call_[c]packed_lowered expects first argument as function name"; + int64_t begin = op->args[2].as()->value; + int64_t end = op->args[3].as()->value; + int64_t num_args = end - begin; + ICHECK_GE(num_args, 0); + + std::string packed_func_name; + if (op->op.same_as(builtin::tvm_call_packed_lowered())) { + packed_func_name = GetPackedName(op); + this->PrintGetFuncFromBackend(func_name->value, packed_func_name); + } else { + // directly use the original symbol + ICHECK(op->op.same_as(builtin::tvm_call_cpacked_lowered())); + packed_func_name = + tvm::ffi::symbol::tvm_ffi_symbol_prefix + func_name->value; + } + + std::string args_stack = PrintExpr(op->args[1]); + this->PrintIndent(); + std::string result = name_supply_->FreshName("result"); + this->stream << "TVMFFIAny " << result << ";\n"; + this->PrintIndent(); + // must make sure type_index is set to none + this->stream << result << ".type_index = kTVMFFINone;\n"; + this->PrintIndent(); + this->stream << result << ".zero_padding = 0;\n"; + this->PrintIndent(); + this->stream << result << ".v_int64 = 0;\n"; + this->PrintIndent(); + if (op->op.same_as(builtin::tvm_call_packed_lowered())) { + this->stream << "if (TVMFFIFunctionCall(" << packed_func_name << ", "; + } else { + this->stream << "if (" << packed_func_name << "(NULL, "; + } + this->stream << "(TVMFFIAny*) " << args_stack << ", " << num_args << ", " + << "&" << result << ") != 0) {\n"; + int func_call_scope = this->BeginScope(); + this->PrintIndent(); + this->stream << "return -1;\n"; + this->EndScope(func_call_scope); + this->PrintIndent(); + this->stream << "}\n"; +} + +std::string CodeGenCHost::GetPackedName(const tvm::tir::CallNode *op) { + using namespace tvm::tir; + const StringImmNode *s = op->args[0].as(); + ICHECK(s != nullptr) + << "tvm_call_packed_lowered expects first argument as function name"; + std::string func_name = s->value; + std::string packed_func_name = 
func_name + "_packed"; + std::string unique_name; + auto it = declared_globals_.find(packed_func_name); + if (it != declared_globals_.end()) { + unique_name = it->second; + } else { + unique_name = name_supply_->FreshName(packed_func_name); + declared_globals_[packed_func_name] = unique_name; + decl_stream << "static void* " << unique_name << " = NULL;\n"; + } + return unique_name; +} + +void CodeGenCHost::VisitExpr_(const tvm::tir::CallNode *op, + std::ostream &os) { // NOLINT(*) + using namespace tvm::tir; + if (op->op.same_as(builtin::tvm_stack_alloca())) { + std::string stack_name = name_supply_->FreshName("stack"); + const std::string &type = op->args[0].as()->value; + const IntImmNode *num = op->args[1].as(); + ICHECK(num != nullptr); + static_assert(alignof(TVMFFIAny) % alignof(DLTensor) == 0, "invariant"); + size_t unit = sizeof(TVMFFIAny); + size_t size = 0; + if (type == "shape") { + size = (num->value * sizeof(ffi::Shape::index_type) + unit - 1) / unit; + } else if (type == "tvm_ffi_any") { + size = (num->value * sizeof(TVMFFIAny) + unit - 1) / unit; + } else if (type == "array") { + size = (num->value * sizeof(DLTensor) + unit - 1) / unit; + } else { + LOG(FATAL) << "Unknown stack alloca type " << type; + } + this->PrintIndent(); + this->stream << "TVMFFIAny " << stack_name << "[" << size << "];\n"; + os << stack_name; + } else if (op->op.same_as(builtin::tvm_call_packed_lowered())) { + this->PrintCallPacked(op); + } else if (op->op.same_as(builtin::tvm_call_cpacked_lowered())) { + this->PrintCallPacked(op); + } else if (op->op.same_as(builtin::tvm_throw_last_error())) { + this->PrintIndent(); + this->stream << "return -1;\n"; + } else { + tvm::codegen::CodeGenC::VisitExpr_(op, os); + } +} + +void CodeGenCHost::VisitStmt_(const tvm::tir::AssertStmtNode *op) { // NOLINT(*) + using namespace tvm::tir; + if (emit_asserts_) { + std::string cond = PrintExpr(op->condition); + PrintIndent(); + stream << "if (!(" << cond << ")) {\n"; + int assert_if_scope = 
this->BeginScope(); + { + // Prepare the base error message + const auto *msg_node = op->message.as(); + ICHECK(msg_node != nullptr) << "Assert message expected to be StringImm"; + const std::string &raw_msg = msg_node->value; + const std::string esc_msg = tvm::support::StrEscape( + raw_msg.c_str(), raw_msg.length(), /*use_octal_escape=*/true, + /*escape_whitespace_special_chars=*/true); + + // If the assertion condition contains any equality checks anywhere + // in a composite boolean expression, append the actual LHS/RHS values + // Collect all EQ nodes within the condition (including inside And/Or/Not) + std::vector eq_nodes; + { + std::vector stk; + stk.push_back(op->condition); + while (!stk.empty()) { + PrimExpr cur = stk.back(); + stk.pop_back(); + if (const auto *eq = cur.as()) { + eq_nodes.push_back(eq); + continue; + } + if (const auto *an = cur.as()) { + stk.push_back(an->a); + stk.push_back(an->b); + continue; + } + if (const auto *on = cur.as()) { + stk.push_back(on->a); + stk.push_back(on->b); + continue; + } + if (const auto *nn = cur.as()) { + stk.push_back(nn->a); + continue; + } + } + } + + if (!eq_nodes.empty()) { + // Build a single detailed message that includes all LHS/RHS pairs + PrintIndent(); + stream << "char __tvm_assert_msg_buf[1024];\n"; + PrintIndent(); + stream << "int __tvm_assert_msg_len = snprintf(__tvm_assert_msg_buf, " + "sizeof(__tvm_assert_msg_buf), \"%s\", \"" + << esc_msg << "\");\n"; + + auto escape_for_printf_literal = [&](const std::string &s) { + std::string out; + out.reserve(s.size()); + for (char c : s) { + if (c == '%') { + out += "%%"; + } else if (c == '"') { + out += "\\\""; + } else if (c == '\\') { + out += "\\\\"; + } else { + out.push_back(c); + } + } + return out; + }; + + for (const auto *eq : eq_nodes) { + std::string lhs = PrintExpr(eq->a); + std::string rhs = PrintExpr(eq->b); + std::string lhs_disp = escape_for_printf_literal(lhs); + std::string rhs_disp = escape_for_printf_literal(rhs); + PrintIndent(); 
+ stream << "__tvm_assert_msg_len += snprintf(__tvm_assert_msg_buf + " + "__tvm_assert_msg_len, " + "sizeof(__tvm_assert_msg_buf) - __tvm_assert_msg_len, \"; (" + << lhs_disp << " == " << rhs_disp + << ") got: %lld, expected: %lld\", (long long)(" << lhs + << "), (long long)(" << rhs << "));\n"; + } + PrintIndent(); + stream << "TVMFFIErrorSetRaisedFromCStr(\"RuntimeError\", " + "__tvm_assert_msg_buf);\n"; + } else { + // Fallback: just emit the base message + PrintIndent(); + stream << "TVMFFIErrorSetRaisedFromCStr(\"RuntimeError\", \"" << esc_msg + << "\");\n"; + } + } + PrintIndent(); + stream << "return -1;\n"; + this->EndScope(assert_if_scope); + PrintIndent(); + stream << "}\n"; + } + this->PrintStmt(op->body); +} + +void CodeGenCHost::VisitExpr_(const tvm::tir::MinNode *op, + std::ostream &os) { // NOLINT(*) + PrintTernaryCondExpr(op, "<", os); +} + +void CodeGenCHost::VisitExpr_(const tvm::tir::MaxNode *op, + std::ostream &os) { // NOLINT(*) + PrintTernaryCondExpr(op, ">", os); +} + +template +inline void CodeGenCHost::PrintTernaryCondExpr(const T *op, const char *compare, + std::ostream &os) { // NOLINT(*) + std::ostringstream temp_a; + VisitExpr(op->a, temp_a); + std::string a_id = SSAGetID(temp_a.str(), op->a.dtype()); + std::ostringstream temp_b; + VisitExpr(op->b, temp_b); + std::string b_id = SSAGetID(temp_b.str(), op->b.dtype()); + + os << "((" << a_id << ") " << compare << " (" << b_id << ") " + << "? (" << a_id << ") : (" << b_id << "))"; +} + +} // namespace tl +} // namespace tvm + +namespace tvm { +namespace tl { + +using tvm::codegen::CodeGenSourceBase; +using tvm::codegen::CSourceModuleCreate; +using tvm::ffi::Array; +using tvm::ffi::Map; +using tvm::ffi::Module; +using tvm::ffi::String; + +// Build function that mirrors TVM's host C codegen, registered under a +// TileLang-specific name. 
+::tvm::ffi::Module BuildTileLangCHost(::tvm::IRModule mod, + ::tvm::Target target) { + bool output_ssa = false; + bool emit_asserts = true; + bool emit_fwd_func_decl = true; + + std::unordered_set devices; + if (mod->GetAttr<::tvm::ffi::Map<::tvm::GlobalVar, ::tvm::ffi::String>>( + "device_contexts") != nullptr) { + ::tvm::ffi::Map<::tvm::GlobalVar, ::tvm::ffi::String> device_contexts = + mod->GetAttr<::tvm::ffi::Map<::tvm::GlobalVar, ::tvm::ffi::String>>( + "device_contexts") + .value(); + for (auto const &context : device_contexts) { + devices.insert(context.second.data()); + } + } + + CodeGenCHost cg; + cg.Init(output_ssa, emit_asserts, emit_fwd_func_decl, target->str(), devices); + cg.SetConstantsByteAlignment( + target->GetAttr<::tvm::Integer>("constants-byte-alignment").value_or(16)); + + auto is_aot_executor_fn = [](::tvm::tir::PrimFunc const &func) -> bool { + return func->GetAttr<::tvm::Bool>("runner_function", ::tvm::Bool(false)) + .value(); + }; + + std::vector> funcs; + for (auto [gvar, base_func] : mod->functions) { + ICHECK(base_func->IsInstance<::tvm::tir::PrimFuncNode>()) + << "CodegenCHost: Can only take PrimFunc"; + auto prim_func = ::tvm::Downcast<::tvm::tir::PrimFunc>(base_func); + funcs.push_back({gvar, prim_func}); + } + + auto sort_key = [&is_aot_executor_fn](const auto &kv) { + return std::tuple{is_aot_executor_fn(kv.second), kv.first->name_hint}; + }; + std::sort(funcs.begin(), funcs.end(), + [&sort_key](const auto &kv_a, const auto &kv_b) { + return sort_key(kv_a) < sort_key(kv_b); + }); + + for (const auto &[gvar, prim_func] : funcs) { + cg.DeclareFunction(gvar, prim_func); + } + + for (const auto &[gvar, prim_func] : funcs) { + cg.AddFunction(gvar, prim_func, emit_fwd_func_decl); + } + + std::string code = cg.Finish(); + return ::tvm::codegen::CSourceModuleCreate(code, "c", cg.GetFunctionNames()); +} + +TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("target.build.tilelang_c", 
BuildTileLangCHost); +} + +} // namespace tl +} // namespace tvm diff --git a/src/target/codegen_c_host.h b/src/target/codegen_c_host.h new file mode 100644 index 00000000..8d54cb4a --- /dev/null +++ b/src/target/codegen_c_host.h @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file codegen_c_host.h + * \brief Generate C host code (TileLang copy). + */ +#ifndef TL_TARGET_SOURCE_CODEGEN_C_HOST_H_ +#define TL_TARGET_SOURCE_CODEGEN_C_HOST_H_ + +#include +#include +#include +#include +#include + +#include "target/source/codegen_c.h" +#include "tvm/target/codegen.h" +#include "tvm/tir/expr.h" + +namespace tvm { +namespace tl { + +// TileLang copy of TVM's CodeGenCHost, under the tl namespace. +// Inherits from tvm::codegen::CodeGenC. +class CodeGenCHost : public tvm::codegen::CodeGenC { +public: + CodeGenCHost(); + void Init(bool output_ssa, bool emit_asserts, bool emit_fwd_func_decl, + std::string target_str, + const std::unordered_set &devices); + + void InitGlobalContext(); + + void AddFunction(const tvm::GlobalVar &gvar, + const tvm::tir::PrimFunc &f) override; + void AddFunction(const tvm::GlobalVar &gvar, const tvm::tir::PrimFunc &f, + bool emit_fwd_func_decl); + /*! 
+   * \brief Add functions from the (unordered) range to the current module in a
+   * deterministic order. This helps with debugging.
+   *
+   * \param functions A vector of unordered range of current module.
+   */
+  void AddFunctionsOrdered(
+      std::vector<std::pair<tvm::GlobalVar, tvm::tir::PrimFunc>> functions);
+  void DefineModuleName();
+
+  using tvm::codegen::CodeGenC::PrintType;
+  void PrintType(tvm::DataType t, std::ostream &os) final; // NOLINT(*)
+  void PrintFuncPrefix(std::ostream &os) final;            // NOLINT(*)
+
+  // overload visitor functions
+  void VisitExpr_(const tvm::tir::BroadcastNode *op,
+                  std::ostream &os) final; // NOLINT(*)
+  void VisitExpr_(const tvm::tir::CallNode *op,
+                  std::ostream &os) override; // NOLINT(*)
+  // overload min and max to use the ternary operator, so we don't rely on the
+  // standard library implementations
+  void VisitExpr_(const tvm::tir::MinNode *op,
+                  std::ostream &os) final; // NOLINT(*)
+  void VisitExpr_(const tvm::tir::MaxNode *op,
+                  std::ostream &os) final; // NOLINT(*)
+
+  void VisitStmt_(const tvm::tir::AssertStmtNode *op) final; // NOLINT(*)
+
+  void GenerateForwardFunctionDeclarations(
+      tvm::ffi::String global_symbol,
+      const tvm::ffi::Array<tvm::Type> &arg_types,
+      const tvm::Type &ret_type) override;
+  tvm::ffi::Array<tvm::ffi::String> GetFunctionNames() {
+    return function_names_;
+  }
+
+private:
+  std::string module_name_;
+  /* \brief mapping global packed func to the unique name */
+  std::unordered_map<std::string, std::string> declared_globals_;
+  /* \brief names of the functions declared in this module */
+  tvm::ffi::Array<tvm::ffi::String> function_names_;
+  /*! \brief whether to emit asserts in the resulting C code */
+  bool emit_asserts_;
+  /*! \brief whether to emit forward function declarations in the resulting C
+   * code */
+  bool emit_fwd_func_decl_;
+  /*! 
\brief whether to generate the entry function if encountered */
+  bool has_main_func_ = false;
+
+  std::string GetPackedName(const tvm::tir::CallNode *op);
+  void PrintGetFuncFromBackend(const std::string &func_name,
+                               const std::string &packed_func_name);
+  void PrintCallPacked(const tvm::tir::CallNode *op);
+  /*!
+   * \brief Print ternary conditional operator implementing binary `op`
+   * Forces the operands to be in SSA form.
+   * \param op binary operator being expressed
+   * \param compare string representation of comparison operator
+   * \param os stream reference to print into
+   */
+  template <typename T>
+  inline void PrintTernaryCondExpr(const T *op, const char *compare,
+                                   std::ostream &os); // NOLINT(*)
+};
+
+} // namespace tl
+} // namespace tvm
+
+#endif // TL_TARGET_SOURCE_CODEGEN_C_HOST_H_
diff --git a/src/target/codegen_cpp.cc b/src/target/codegen_cpp.cc
index 9accf530..975f9a48 100644
--- a/src/target/codegen_cpp.cc
+++ b/src/target/codegen_cpp.cc
@@ -203,12 +203,12 @@ void CodeGenTileLangCPP::PrintFuncCall(const std::string &packed_func_name,
   this->PrintIndent();
   std::string ret_val = name_supply_->FreshName("ret_val");
   std::string ret_type_code = name_supply_->FreshName("ret_type_code");
-  this->stream << "TVMValue " << ret_val << ";\n";
+  this->stream << "TVMFFIAny " << ret_val << ";\n";
   this->PrintIndent();
   this->stream << "int " << ret_type_code << ";\n";
   this->PrintIndent();
   this->stream << "if (TVMFuncCall(" << packed_func_name << ", "
-               << "(TVMValue*) stack_value"
+               << "(TVMFFIAny*) stack_value"
                << ", "
                << "(int*) stack_tcode"
                << ", " << num_args << ", "
@@ -228,13 +228,13 @@ void CodeGenTileLangCPP::PrintFuncCallC(
   this->PrintIndent();
   std::string ret_val = name_supply_->FreshName("ret_val");
   std::string ret_type_code = name_supply_->FreshName("ret_type_code");
-  this->stream << "TVMValue " << ret_val << ";\n";
+  this->stream << "TVMFFIAny " << ret_val << ";\n";
   this->PrintIndent();
   this->stream << "int " << ret_type_code << ";\n";
   this->PrintIndent();
this->stream << "if (" << packed_func_name << "( " - << "(TVMValue*) stack_value " + << "(TVMFFIAny*) stack_value " << ", " << "(int*) stack_tcode" << ", " << num_args << ", " diff --git a/src/target/rt_mod_cuda.cc b/src/target/rt_mod_cuda.cc index bb69170f..cbef0e64 100644 --- a/src/target/rt_mod_cuda.cc +++ b/src/target/rt_mod_cuda.cc @@ -24,7 +24,11 @@ ExtractFuncInfo(const IRModule &mod) { continue; } } - info.arg_types.push_back(f->params[i].dtype()); + DataType dtype = f->params[i].dtype(); + // Device runtime cannot directly take bool arguments, map to int32. + if (dtype.is_bool()) + dtype = DataType::Int(32); + info.arg_types.push_back(dtype); } if (auto opt = f->GetAttr>( tir::attr::kKernelLaunchParams)) { diff --git a/src/target/rt_mod_hip.cc b/src/target/rt_mod_hip.cc index 50991d63..1e5c689c 100644 --- a/src/target/rt_mod_hip.cc +++ b/src/target/rt_mod_hip.cc @@ -35,7 +35,11 @@ ExtractFuncInfo(const IRModule &mod) { continue; } } - info.arg_types.push_back(f->params[i].dtype()); + DataType dtype = f->params[i].dtype(); + // Device runtime cannot directly take bool arguments, map to int32. + if (dtype.is_bool()) + dtype = DataType::Int(32); + info.arg_types.push_back(dtype); } if (auto opt = f->GetAttr>( tir::attr::kKernelLaunchParams)) { diff --git a/src/transform/arg_binder.cc b/src/transform/arg_binder.cc index 7df6d0cc..6a0909b8 100644 --- a/src/transform/arg_binder.cc +++ b/src/transform/arg_binder.cc @@ -51,6 +51,43 @@ void BinderAddAssert(arith::Analyzer *ana, PrimExpr cond, } } +bool ArgBinder::BindNullable(const PrimExpr &arg, const PrimExpr &value, + const std::string &arg_name, bool with_lets, + const PrimExpr &nullable_guard) { + // Currently only used in BindDLTensor, nullable_guard is already a defined + // bool, so use it directly. 
+ auto MakeGuarded = [&](PrimExpr basic) -> PrimExpr { + // is_null || basic + return Or(nullable_guard, basic); + }; + + ICHECK_EQ(arg.dtype(), value.dtype()) << "arg " << arg << " value " << value; + if (const VarNode *v = arg.as()) { + auto it = def_map_->find(v); + if (it == def_map_->end()) { + // First time binding: identical behavior as Bind_ + Var v_arg = Downcast(arg); + defs_.emplace_back(v_arg); + if (with_lets) { + (*def_map_)[v] = arg; + init_nest_.emplace_back(LetStmt(v_arg, value, Evaluate(0))); + } else { + (*def_map_)[v] = value; + } + return true; + } else { + // Second or later binding: add is_null short-circuit + PrimExpr cond = MakeGuarded(it->second == value); + BinderAddAssert(&analyzer_, cond, arg_name, &asserts_); + } + } else { + // For non-Var expressions, also add is_null short-circuit + PrimExpr cond = MakeGuarded(arg == value); + BinderAddAssert(&analyzer_, cond, arg_name, &asserts_); + } + return false; +} + bool ArgBinder::Bind_(const PrimExpr &arg, const PrimExpr &value, const std::string &arg_name, bool with_lets) { ICHECK_EQ(arg.dtype(), value.dtype()) << "arg " << arg << " value " << value; @@ -96,8 +133,30 @@ void ArgBinder::BindBuffer(const Buffer &arg, const Buffer &value, const std::string &arg_name, bool fuzzy_match) { ICHECK_EQ(arg.scope(), value.scope()) << "Argument " << arg_name << " Buffer bind scope mismatch"; - ICHECK_EQ(arg->dtype, value->dtype) - << "Argument " << arg_name << " Buffer bind data type mismatch"; + // Relax dtype check to allow FP8 E4M3 variants to bind together. + auto dtype_compatible = [](DataType expected, DataType provided) -> bool { + if (expected == provided) + return true; + // If expected is float8_e4m3, allow float8_e4m3fn/float8_e4m3fnuz as well. + if (expected.is_float8_e4m3()) { + return provided.is_float8_e4m3() || provided.is_float8_e4m3fn() || + provided.is_float8_e4m3fnuz(); + } + // If expected is float8_e5m2, allow float8_e5m2fnuz as well. 
+ if (expected.is_float8_e5m2()) { + return provided.is_float8_e5m2() || provided.is_float8_e5m2fnuz(); + } + // If expected is bool, allow binding from int8/uint8 with same lanes. + if (expected.is_bool()) { + bool is_i8 = provided.is_int() && provided.bits() == 8; + bool is_u8 = provided.is_uint() && provided.bits() == 8; + return (is_i8 || is_u8) && expected.lanes() == provided.lanes(); + } + return false; + }; + ICHECK(dtype_compatible(arg->dtype, value->dtype)) + << "Argument " << arg_name << " Buffer bind data type mismatch: expected " + << arg->dtype << ", got " << value->dtype; if (value->data_alignment % arg->data_alignment != 0) { LOG(WARNING) << "Trying to bind buffer to another one with lower alignment " "requirement " @@ -167,10 +226,15 @@ void ArgBinder::BindDLTensor(const Buffer &buffer, const PrimExpr &device_type, const DataType tvm_ndim_type = DataType::Int(32); const Stmt nop = Evaluate(0); - init_nest_.emplace_back(AssertStmt( - !Call(DataType::Bool(), builtin::isnullptr(), {handle}), - StringImm(arg_name + " is expected to have non-NULL DLTensor* pointer"), - nop)); + // Allow NULL DLTensor* for optional inputs. When the handle is NULL, + // avoid dereferencing it by using expression-level conditionals and + // short-circuiting guards in asserts. Cache the null check in a Let-bound + // boolean so codegen does not repeat `(handle == NULL)` everywhere. + Var is_null_var(arg_name + "_is_null", DataType::Bool()); + init_nest_.emplace_back( + LetStmt(is_null_var, + Call(DataType::Bool(), builtin::isnullptr(), {handle}), nop)); + const PrimExpr &is_null = is_null_var; // dimension checks PrimExpr v_ndim = TVMArrayGet(tvm_ndim_type, handle, builtin::kArrNDim); @@ -193,25 +257,91 @@ void ArgBinder::BindDLTensor(const Buffer &buffer, const PrimExpr &device_type, PrimExpr a_ndim = make_const(tvm_ndim_type, static_cast(buffer->shape.size())); std::ostringstream ndim_err_msg; + // Note: We cannot embed runtime values into the message string. 
+ // Keep message human-friendly without printing TIR exprs. ndim_err_msg << arg_name << ".ndim is expected to equal " - << buffer->shape.size(); + << buffer->shape.size() << ", but got mismatched ndim"; auto msg = StringImm(ndim_err_msg.str()); - init_nest_.emplace_back(AssertStmt(a_ndim == v_ndim, msg, nop)); + // Only check ndim when handle is non-NULL (using short-circuit OR) + v_ndim = tvm::if_then_else(Not(is_null), v_ndim, make_zero(tvm_ndim_type)); + init_nest_.emplace_back(AssertStmt(Or(is_null, a_ndim == v_ndim), msg, nop)); // type checks std::ostringstream type_err_msg; - type_err_msg << arg_name << ".dtype is expected to be " << buffer->dtype; - PrimExpr cond = - (TVMArrayGet(DataType::UInt(8), handle, builtin::kArrTypeCode) == - IntImm(DataType::UInt(8), buffer->dtype.code()) && - TVMArrayGet(DataType::UInt(8), handle, builtin::kArrTypeBits) == - IntImm(DataType::UInt(8), buffer->dtype.bits()) && - TVMArrayGet(DataType::UInt(16), handle, builtin::kArrTypeLanes) == - IntImm(DataType::UInt(16), buffer->dtype.lanes())); + // Avoid dumping TIR expressions in error text; just state mismatch. + // Include expected dtype triplet for clarity. 
+ type_err_msg << arg_name << ".dtype is expected to be " << buffer->dtype + << ", but got incompatible dtype"; + // Guard all dtype field loads by `is_null` using if_then_else + PrimExpr v_type_code = tvm::if_then_else( + Not(is_null), + TVMArrayGet(DataType::UInt(8), handle, builtin::kArrTypeCode), + IntImm(DataType::UInt(8), buffer->dtype.code())); + PrimExpr v_type_bits = tvm::if_then_else( + Not(is_null), + TVMArrayGet(DataType::UInt(8), handle, builtin::kArrTypeBits), + IntImm(DataType::UInt(8), buffer->dtype.bits())); + PrimExpr v_type_lanes = tvm::if_then_else( + Not(is_null), + TVMArrayGet(DataType::UInt(16), handle, builtin::kArrTypeLanes), + IntImm(DataType::UInt(16), buffer->dtype.lanes())); + PrimExpr expect_code = IntImm(DataType::UInt(8), buffer->dtype.code()); + PrimExpr expect_bits = IntImm(DataType::UInt(8), buffer->dtype.bits()); + PrimExpr expect_lanes = IntImm(DataType::UInt(16), buffer->dtype.lanes()); + + PrimExpr cond = (v_type_code == expect_code && v_type_bits == expect_bits && + v_type_lanes == expect_lanes); + + // Allow float8_e4m3 to match float8_e4m3fn/float8_e4m3fnuz at runtime. + if (buffer->dtype.is_float8_e4m3()) { + PrimExpr code_e4m3 = IntImm(DataType::UInt(8), DataType::kFloat8_e4m3); + PrimExpr code_e4m3fn = IntImm(DataType::UInt(8), DataType::kFloat8_e4m3fn); + PrimExpr code_e4m3fnuz = + IntImm(DataType::UInt(8), DataType::kFloat8_e4m3fnuz); + PrimExpr code_match = + (v_type_code == code_e4m3 || v_type_code == code_e4m3fn || + v_type_code == code_e4m3fnuz); + cond = cond || (code_match && v_type_bits == expect_bits && + v_type_lanes == expect_lanes); + } + // Allow float8_e5m2 to match float8_e5m2fnuz at runtime. 
+ if (buffer->dtype.is_float8_e5m2()) { + PrimExpr code_e5m2 = IntImm(DataType::UInt(8), DataType::kFloat8_e5m2); + PrimExpr code_e5m2fnuz = + IntImm(DataType::UInt(8), DataType::kFloat8_e5m2fnuz); + PrimExpr code_match = + (v_type_code == code_e5m2 || v_type_code == code_e5m2fnuz); + cond = cond || (code_match && v_type_bits == expect_bits && + v_type_lanes == expect_lanes); + } + // Allow bool to match int8/uint8 at runtime, and also kDLBool(code=6). + if (buffer->dtype.is_bool()) { + PrimExpr code_int = IntImm(DataType::UInt(8), DataType::kInt); + PrimExpr code_uint = IntImm(DataType::UInt(8), DataType::kUInt); + PrimExpr code_kdlbool = IntImm(DataType::UInt(8), 6); + PrimExpr bits8 = IntImm(DataType::UInt(8), 8); + PrimExpr bits1 = IntImm(DataType::UInt(8), 1); + PrimExpr lanes_ok = (v_type_lanes == expect_lanes); + PrimExpr int8_ok = + (v_type_code == code_int && v_type_bits == bits8 && lanes_ok); + PrimExpr uint8_ok = + (v_type_code == code_uint && v_type_bits == bits8 && lanes_ok); + // Some frontends may tag bool tensors as kDLBool(code=6), commonly with + // bits=8 or bits=1. + PrimExpr kdlbool8_ok = + (v_type_code == code_kdlbool && v_type_bits == bits8 && lanes_ok); + PrimExpr kdlbool1_ok = + (v_type_code == code_kdlbool && v_type_bits == bits1 && lanes_ok); + // Also accept any dtype whose bitwidth=1, regardless of code, to be + // defensive. 
+ PrimExpr bit1_ok = (v_type_bits == bits1 && lanes_ok); + cond = cond || int8_ok || uint8_ok || kdlbool8_ok || kdlbool1_ok || bit1_ok; + } if (!(buffer->dtype == DataType::Int(1) || buffer->dtype == DataType::Int(4) || buffer->dtype == DataType::UInt(4))) { auto type_msg = StringImm(type_err_msg.str()); - asserts_.emplace_back(AssertStmt(cond, type_msg, nop)); + // Only check dtype when handle is non-NULL (short-circuit) + asserts_.emplace_back(AssertStmt(Or(is_null, cond), type_msg, nop)); } // shape field @@ -220,32 +350,70 @@ void ArgBinder::BindDLTensor(const Buffer &buffer, const PrimExpr &device_type, tvm_shape_type, shape_handle_name()); Var v_shape(shape_handle_name(), DataType::Handle()); def_handle_dtype_.Set(v_shape, make_const(tvm_shape_type, 0)); - init_nest_.emplace_back(LetStmt( - buf_shape->data, - TVMArrayGet(DataType::Handle(), handle, builtin::kArrShape), nop)); + // Use if_then_else for NULL guard on the shape pointer itself, avoiding + // dereferencing TVMStructGet(handle, kArrShape) when handle is NULL. + init_nest_.emplace_back( + LetStmt(buf_shape->data, + tvm::if_then_else( + Not(is_null), + TVMArrayGet(DataType::Handle(), handle, builtin::kArrShape), + make_zero(DataType::Handle())), + nop)); init_nest_.emplace_back(DeclBuffer(buf_shape, nop)); + for (size_t k = 0; k < buffer->shape.size(); ++k) { + // These packed-bit dtype shapes were not bound in the original + // implementation, so we just use them as is. 
if (buffer->dtype == DataType::Int(4) || buffer->dtype == DataType::UInt(4) || buffer->dtype == DataType::Int(1)) { break; } - Bind_(buffer->shape[k], - cast(buffer->shape[k].dtype(), - BufferLoad(buf_shape, {IntImm(DataType::Int(32), k)})), - shape_element_name(k), true); + + // The "real" runtime shape value read from DLTensor + PrimExpr raw_shape_val = + cast(buffer->shape[k].dtype(), + BufferLoad(buf_shape, + {IntImm(DataType::Int(32), static_cast(k))})); + + // Bind to the value of the symbolic dimension (e.g., m) in TIR, with an + // is_null guard: + // handle is NULL → use 0, placeholder but no dereference + // handle non-NULL → actually read from DLTensor's shape array + PrimExpr bound_shape_val = tvm::if_then_else( + is_null, make_zero(buffer->shape[k].dtype()), raw_shape_val); + + // When first encountering a Var (e.g., m), this will generate: + // Let(m, bound_shape_val, ...) + // Constant dimensions will only generate consistency assertions. + BindNullable(buffer->shape[k], bound_shape_val, shape_element_name(k), true, + is_null); + + // Keep an explicit "consistency check": when non-NULL, the symbolic + // dimension must equal the DLTensor's shape. 
+ Stmt shape_check = AssertStmt( + Or(is_null, buffer->shape[k] == raw_shape_val), + StringImm(shape_element_name(k) + " mismatch with DLTensor shape"), + Evaluate(0)); + asserts_.emplace_back(shape_check); } + // strides field Buffer buf_strides = decl_buffer({IntImm(DataType::Int(32), buffer->strides.size())}, tvm_shape_type, arg_name + ".strides"); def_handle_dtype_.Set(buf_strides->data, tir::TypeAnnotation(tvm_shape_type)); - init_nest_.emplace_back(LetStmt( - buf_strides->data, - TVMArrayGet(DataType::Handle(), handle, builtin::kArrStrides), nop)); + init_nest_.emplace_back( + LetStmt(buf_strides->data, + tvm::if_then_else( + Not(is_null), + TVMArrayGet(DataType::Handle(), handle, builtin::kArrStrides), + make_zero(DataType::Handle())), + nop)); init_nest_.emplace_back(DeclBuffer(buf_strides, nop)); PrimExpr v_strides_is_null = Call(DataType::Bool(1), builtin::isnullptr(), {buf_strides->data}); + if (buffer->strides.empty()) { // Assert the buffer is compact DataType stype = buffer->DefaultIndexType(); @@ -253,13 +421,16 @@ void ArgBinder::BindDLTensor(const Buffer &buffer, const PrimExpr &device_type, ffi::Array conds; for (size_t i = buffer->shape.size(); i != 0; --i) { size_t k = i - 1; - PrimExpr svalue = - cast(stype, BufferLoad(buf_strides, {IntImm(DataType::Int(32), k)})); + PrimExpr svalue = cast( + stype, BufferLoad(buf_strides, + {IntImm(DataType::Int(32), static_cast(k))})); conds.push_back(buffer->shape[k] == 1 || expect_stride == svalue); expect_stride = expect_stride * buffer->shape[k]; } std::ostringstream stride_err_msg; - stride_err_msg << stride_handle_name() << ": expected to be compact array"; + stride_err_msg + << stride_handle_name() + << ": expected to be compact array, but got non-compact strides"; if (!conds.empty()) { auto stride_msg = StringImm(stride_err_msg.str()); Stmt check = @@ -267,6 +438,7 @@ void ArgBinder::BindDLTensor(const Buffer &buffer, const PrimExpr &device_type, Span span) { return logical_and(a, b, span); }, 
const_true(1), conds), stride_msg, Evaluate(0)); + // Only check when strides array is actually present at runtime check = IfThenElse(Not(v_strides_is_null), check); asserts_.emplace_back(SeqStmt({check, Evaluate(0)})); } @@ -277,13 +449,27 @@ void ArgBinder::BindDLTensor(const Buffer &buffer, const PrimExpr &device_type, DataType stride_dtype = buffer->strides[k].dtype(); PrimExpr explicit_stride = cast(stride_dtype, - BufferLoad(buf_strides, {IntImm(DataType::Int(32), k)})); + BufferLoad(buf_strides, + {IntImm(DataType::Int(32), static_cast(k))})); PrimExpr stride_from_shape_cast = cast(stride_dtype, stride_from_shape); - PrimExpr value = tvm::if_then_else( + + PrimExpr core_value = tvm::if_then_else( v_strides_is_null, stride_from_shape_cast, explicit_stride); - value = tvm::if_then_else(buffer->shape[k] == 1, make_zero(stride_dtype), - value); - Bind_(buffer->strides[k], value, stride_element_name(k), true); + core_value = tvm::if_then_else(buffer->shape[k] == 1, + make_zero(stride_dtype), core_value); + + // Bind like shape: define var when needed, and only assert when non-NULL + PrimExpr bound_stride_val = + tvm::if_then_else(is_null, make_zero(stride_dtype), core_value); + BindNullable(buffer->strides[k], bound_stride_val, stride_element_name(k), + true, is_null); + + Stmt stride_check = AssertStmt( + Or(is_null, buffer->strides[k] == core_value), + StringImm(stride_element_name(k) + " mismatch with DLTensor strides"), + Evaluate(0)); + asserts_.emplace_back(stride_check); + PrimExpr shape_extent = cast(stride_dtype, buffer->shape[k]); stride_from_shape = analyzer_.Simplify(stride_from_shape_cast * shape_extent); @@ -291,7 +477,7 @@ void ArgBinder::BindDLTensor(const Buffer &buffer, const PrimExpr &device_type, } else { PrimExpr stride_from_shape = make_const(buffer->DefaultIndexType(), 1); - for (int k = buffer->strides.size() - 1; k >= 0; k--) { + for (int k = static_cast(buffer->strides.size()) - 1; k >= 0; --k) { DataType stride_dtype = 
buffer->strides[k].dtype(); PrimExpr explicit_stride = cast(stride_dtype, @@ -300,75 +486,127 @@ void ArgBinder::BindDLTensor(const Buffer &buffer, const PrimExpr &device_type, stride_dtype, BufferLoad(buf_shape, {IntImm(DataType::Int(32), k)})); PrimExpr stride_from_shape_cast = cast(stride_dtype, stride_from_shape); - Bind_(buffer->strides[k], - tvm::if_then_else(v_strides_is_null, stride_from_shape_cast, - explicit_stride), - stride_element_name(k), true); + PrimExpr core_value = tvm::if_then_else( + v_strides_is_null, stride_from_shape_cast, explicit_stride); + + PrimExpr bound_stride_val = + tvm::if_then_else(is_null, make_zero(stride_dtype), core_value); + BindNullable(buffer->strides[k], bound_stride_val, stride_element_name(k), + true, is_null); + + Stmt stride_check = AssertStmt( + Or(is_null, buffer->strides[k] == core_value), + StringImm(stride_element_name(k) + " mismatch with DLTensor strides"), + Evaluate(0)); + asserts_.emplace_back(stride_check); stride_from_shape = analyzer_.Simplify(stride_from_shape_cast * shape_stride); } } + // Byte_offset field. int data_bytes = GetVectorBytes(buffer->dtype); if (const auto *const_offset = buffer->elem_offset.as()) { - Bind_(make_const(DataType::UInt(64), const_offset->value * data_bytes), - TVMArrayGet(DataType::UInt(64), handle, builtin::kArrByteOffset), - arg_name + ".byte_offset", true); + // Constant elem_offset: only need consistency check, no need for additional + // Var binding. 
+ PrimExpr actual_byte_offset = tvm::if_then_else( + Not(is_null), + TVMArrayGet(DataType::UInt(64), handle, builtin::kArrByteOffset), + make_const(DataType::UInt(64), 0)); + PrimExpr expect_byte_offset = + make_const(DataType::UInt(64), const_offset->value * data_bytes); + Stmt byte_off_check = + AssertStmt(Or(is_null, expect_byte_offset == actual_byte_offset), + StringImm(arg_name + ".byte_offset mismatch"), nop); + asserts_.emplace_back(byte_off_check); } else { - if (Bind_(buffer->elem_offset, - cast(buffer->elem_offset.dtype(), - (TVMArrayGet(DataType::UInt(64), handle, - builtin::kArrByteOffset) / - make_const(DataType::UInt(64), data_bytes))), - arg_name + ".elem_offset", true)) { - if (buffer->offset_factor > 1) { - PrimExpr offset = buffer->elem_offset; - PrimExpr factor = make_const(offset.dtype(), buffer->offset_factor); - PrimExpr zero = make_zero(offset.dtype()); - BinderAddAssert(&analyzer_, truncmod(offset, factor) == zero, - arg_name + ".elem_offset", &asserts_); - } + PrimExpr actual_byte_offset = tvm::if_then_else( + Not(is_null), + TVMArrayGet(DataType::UInt(64), handle, builtin::kArrByteOffset), + make_const(DataType::UInt(64), 0)); + PrimExpr expect_elem_off = + cast(buffer->elem_offset.dtype(), + (actual_byte_offset / make_const(DataType::UInt(64), data_bytes))); + + // Like shape/stride, do NULL-safe binding for elem_offset: + // handle is NULL → 0 + // handle non-NULL → actual_byte_offset / data_bytes + PrimExpr bound_elem_off = tvm::if_then_else( + is_null, make_zero(buffer->elem_offset.dtype()), expect_elem_off); + BindNullable(buffer->elem_offset, bound_elem_off, arg_name + ".elem_offset", + true, is_null); + + // Strict consistency check for non-NULL case + Stmt elem_off_check = + AssertStmt(Or(is_null, buffer->elem_offset == expect_elem_off), + StringImm(arg_name + ".elem_offset mismatch"), nop); + asserts_.emplace_back(elem_off_check); + + if (buffer->offset_factor > 1) { + PrimExpr offset = buffer->elem_offset; + PrimExpr factor = 
make_const(offset.dtype(), buffer->offset_factor); + PrimExpr zero = make_zero(offset.dtype()); + Stmt off_factor_check = + AssertStmt(Or(is_null, truncmod(offset, factor) == zero), + StringImm(arg_name + ".elem_offset factor mismatch"), nop); + asserts_.emplace_back(off_factor_check); } } + // device info. - Bind_(device_type, - TVMArrayGet(DataType::Int(32), handle, builtin::kArrDeviceType), - arg_name + ".device_type", true); - Bind_(device_id, - TVMArrayGet(DataType::Int(32), handle, builtin::kArrDeviceId), - arg_name + ".device_id", true); + // Define device_id from handle when available (so later passes can use it) + PrimExpr actual_dev_type = tvm::if_then_else( + Not(is_null), + TVMArrayGet(DataType::Int(32), handle, builtin::kArrDeviceType), + make_zero(DataType::Int(32))); + PrimExpr actual_dev_id = tvm::if_then_else( + Not(is_null), + TVMArrayGet(DataType::Int(32), handle, builtin::kArrDeviceId), + make_zero(DataType::Int(32))); + // Bind device_id to a safe expression (0 when NULL handle) + BindNullable(device_id, actual_dev_id, arg_name + ".device_id", true, + is_null); + // Check device_type consistency (device_id equality is implicitly ensured by + // binding above) + init_nest_.emplace_back( + AssertStmt(Or(is_null, device_type == actual_dev_type), + StringImm(arg_name + ".device_type mismatch"), nop)); // Data field. Because the validation of the data field may depend // on a dynamic size defined by the other DLTensor* parameters, this // field must be generated last. - if (Bind_(buffer->data, - TVMArrayGet(DataType::Handle(), handle, builtin::kArrData), - arg_name + ".data", true)) { + // Bind data pointer using expression-level guard to avoid deref on NULL. + { Var vptr(buffer->data); + PrimExpr data_ptr = tvm::if_then_else( + Not(is_null), + TVMArrayGet(DataType::Handle(), handle, builtin::kArrData), + make_zero(DataType::Handle())); + BindNullable(buffer->data, data_ptr, arg_name + ".data", true, is_null); // Check if the data pointer is NULL. 
This check is skipped for - // size-0 arrays, since CUDA provides a NULL pointer for size-zero - // allocations. + // size-0 arrays and also skipped when handle itself is NULL. auto alloc_size = [&]() -> PrimExpr { PrimExpr product = IntImm(buffer->DefaultIndexType(), 1); - for (const auto &dim : buffer->shape) { + for (const auto &dim : buffer->shape) product *= dim; - } return product; }(); asserts_.emplace_back(AssertStmt( - alloc_size == 0 || - !Call(DataType::Bool(), builtin::isnullptr(), {vptr}), - StringImm(arg_name + " is expected to have non-NULL data pointer"), + Or(is_null, (alloc_size == 0) || + !Call(DataType::Bool(), builtin::isnullptr(), {vptr})), + StringImm(arg_name + + " is expected to have non-NULL data pointer, but got NULL"), nop)); - def_handle_dtype_.Set(vptr, tir::TypeAnnotation(buffer->dtype)); // mark alignment of external bufs init_nest_.emplace_back( AttrStmt(vptr, tir::attr::storage_alignment, IntImm(DataType::Int(32), buffer->data_alignment), nop)); + + def_handle_dtype_.Set(vptr, tir::TypeAnnotation(buffer->dtype)); } } diff --git a/src/transform/arg_binder.h b/src/transform/arg_binder.h index d04e7e9b..cf9f8466 100644 --- a/src/transform/arg_binder.h +++ b/src/transform/arg_binder.h @@ -154,6 +154,10 @@ public: return def_handle_dtype_; } + bool BindNullable(const PrimExpr &arg, const PrimExpr &value, + const std::string &arg_name, bool with_lets, + const PrimExpr &nullable_guard); + private: // Internal bind function bool Bind_(const PrimExpr &arg, const PrimExpr &value, diff --git a/src/transform/lower_hopper_intrin.cc b/src/transform/lower_hopper_intrin.cc index b082a574..e9c848ac 100644 --- a/src/transform/lower_hopper_intrin.cc +++ b/src/transform/lower_hopper_intrin.cc @@ -26,10 +26,13 @@ public: LowerHopperIntrin substituter(disable_shuffle_elect); fptr->body = substituter.VisitStmt(f->body); Map> init_desc_arg_map; + // Collect prologue/epilogue statements for host-side setup/teardown + Array prologue_stmts; + Array 
epilogue_stmts; for (const auto &[call, var] : substituter.desc_map_) { // Should allocate 128 bytes for TensorMap on stack Call alloc_desc = Call(DataType::Handle(), builtin::tvm_stack_alloca(), - {StringImm("arg_value"), 16}); + {StringImm("tvm_ffi_any"), 16}); Array init_desc_args; if (call->op.same_as(create_tma_descriptor())) { init_desc_args.push_back(StringImm(tvm_tensormap_create_tiled)); @@ -44,11 +47,66 @@ public: // add to function attribute Call init_desc = Call(DataType::Handle(), builtin::tvm_call_packed(), init_desc_args); - fptr->body = - LetStmt(var, alloc_desc, SeqStmt({Evaluate(init_desc), fptr->body})); + // Accumulate TMA descriptor init into prologue + prologue_stmts.push_back(LetStmt(var, alloc_desc, Evaluate(init_desc))); init_desc_arg_map.Set(var, init_desc_args); } f = WithAttr(std::move(f), "tma_descriptor_args", init_desc_arg_map); + + // Additionally, if L2 persistent cache annotations were lowered earlier, + // materialize TVM FFI calls to set the stream access policy window. 
+ if (f->attrs.defined() && f->attrs->dict.count("l2_persistent_map")) { + auto l2_map = + f->GetAttr>>("l2_persistent_map"); + if (l2_map.defined()) { + // Build a lookup from buffer name to Buffer object + std::unordered_map name2buf; + for (const auto &kv : f->buffer_map) { + name2buf.emplace(kv.second->name, kv.second); + } + for (const auto &kv : l2_map.value()) { + const std::string buf_name = kv.first; + const Array &args = kv.second; + if (name2buf.count(buf_name) == 0) { + continue; + } + const Buffer &buf = name2buf.at(buf_name); + // Build base pointer expression (read access) + PrimExpr base_ptr = buf.access_ptr(1); + // Args packed: func_name, base_ptr, num_bytes, hit_ratio + Array packed_args; + packed_args.push_back( + StringImm(tvm_cuda_stream_set_access_policy_window)); + packed_args.push_back(base_ptr); + // size_in_bytes (args[1]) then hit_ratio (args[0]) + ICHECK_GE(args.size(), 2); + packed_args.push_back(args[1]); + packed_args.push_back(args[0]); + prologue_stmts.push_back(Evaluate(Call( + DataType::Int(32), builtin::tvm_call_packed(), packed_args))); + } + // Add a single epilogue call to reset the access policy window and + // restore L2 limit + Array reset_args; + reset_args.push_back( + StringImm(tvm_cuda_stream_reset_access_policy_window)); + epilogue_stmts.push_back(Evaluate( + Call(DataType::Int(32), builtin::tvm_call_packed(), reset_args))); + } + } + + // Stitch prologue statements before the original body + if (!prologue_stmts.empty()) { + // Chain the Let/Evaluate statements sequentially + Stmt seq = prologue_stmts.size() == 1 ? prologue_stmts[0] + : SeqStmt(prologue_stmts); + fptr->body = SeqStmt({seq, fptr->body}); + } + if (!epilogue_stmts.empty()) { + Stmt seq_end = epilogue_stmts.size() == 1 ? 
epilogue_stmts[0] + : SeqStmt(epilogue_stmts); + fptr->body = SeqStmt({fptr->body, seq_end}); + } return f; } diff --git a/src/transform/make_packed_api.cc b/src/transform/make_packed_api.cc index 545d2403..187a75dc 100644 --- a/src/transform/make_packed_api.cc +++ b/src/transform/make_packed_api.cc @@ -20,6 +20,7 @@ /*! * \file make_packed_api.cc Lower PrimFunc to use the packed function API. */ +#include #include #include #include @@ -32,6 +33,7 @@ #include #include +#include #include #include @@ -43,13 +45,11 @@ namespace tvm { namespace tl { using namespace tir; using namespace ffi; -static constexpr const char *kDeviceContextVar = "device_api_context"; namespace { class ReturnRewriter : public StmtMutator { public: - explicit ReturnRewriter(Var ret_var, Var ret_tcode) - : ret_var_(std::move(ret_var)), ret_tcode_(std::move(ret_tcode)) {} + explicit ReturnRewriter(Var ret_var) : ret_var_(ret_var) {} Stmt VisitStmt_(const ForNode *node) override { if (node->kind == ForKind::kParallel) @@ -79,8 +79,6 @@ private: struct ConvertedInfo { int type_index{-1}; PrimExpr expr; - Buffer dummy_val_buffer; - Buffer dummy_tcode_buffer; }; ConvertedInfo ConvertForFFI(const PrimExpr &val) { @@ -88,7 +86,11 @@ private: // convert val's data type to FFI data type, return type code DataType dtype = val.dtype(); - if (dtype.is_int() || dtype.is_uint()) { + if (dtype.is_bool()) { + info.type_index = ffi::TypeIndex::kTVMFFIBool; + info.expr = Cast(DataType::Int(64), val); + + } else if (dtype.is_int() || dtype.is_uint()) { info.type_index = ffi::TypeIndex::kTVMFFIInt; info.expr = Cast(DataType::Int(64), val); } else if (dtype.is_float()) { @@ -101,56 +103,39 @@ private: LOG(FATAL) << "data type " << dtype << " not supported yet"; } - // If multiple return locations have the same data type, use the - // same dummy buffer declaration. 
- auto it = dummy_val_buffer_map_.find(info.type_index); - if (it != dummy_val_buffer_map_.end()) { - info.dummy_val_buffer = it->second; - } else { - info.dummy_val_buffer = - Buffer(ret_var_, info.expr.dtype(), {1}, {1}, ConstInt32(0), - ret_var_->name_hint, 0, 0, kDefault); - dummy_val_buffer_map_[info.type_index] = info.dummy_val_buffer; - } - - // The type_index is always a 32-bit int, so we don't need to have a - // separate map. - if (!dummy_tcode_buffer_.defined()) { - dummy_tcode_buffer_ = - Buffer(ret_tcode_, DataType::Int(32), {1}, {1}, ConstInt32(0), - ret_tcode_->name_hint, 0, 0, kDefault); - } - info.dummy_tcode_buffer = dummy_tcode_buffer_; - return info; } - Stmt WriteToOut(const PrimExpr &val) { + Stmt WriteToOut(PrimExpr val) { auto info = ConvertForFFI(val); - Stmt store_val = BufferStore(info.dummy_val_buffer, info.expr, {0}); - Stmt store_tcode = - BufferStore(info.dummy_tcode_buffer, info.type_index, {0}); + Stmt store_tindex = tir::Evaluate( + tir::Call(DataType::Int(32), tir::builtin::tvm_struct_set(), + {ret_var_, IntImm(DataType::Int(32), 0), + IntImm(DataType::Int(32), tir::builtin::kTVMFFIAnyTypeIndex), + IntImm(DataType::Int(32), info.type_index)})); + Stmt store_zero_padding = tir::Evaluate(tir::Call( + DataType::Int(32), tir::builtin::tvm_struct_set(), + {ret_var_, IntImm(DataType::Int(32), 0), + IntImm(DataType::Int(32), tir::builtin::kTVMFFIAnyZeroPadding), + IntImm(DataType::Int(32), 0)})); + Stmt store_val = tir::Evaluate(tir::Call( + DataType::Int(32), tir::builtin::tvm_struct_set(), + {ret_var_, IntImm(DataType::Int(32), 0), + IntImm(DataType::Int(32), tir::builtin::kTVMFFIAnyUnionValue), + info.expr})); Stmt ret_zero = Evaluate(tvm::ret(0)); - return SeqStmt({store_val, store_tcode, ret_zero}); + return SeqStmt({store_tindex, store_zero_padding, store_val, ret_zero}); } Var ret_var_; - Var ret_tcode_; int in_parallel_{0}; - - std::unordered_map dummy_val_buffer_map_; - Buffer dummy_tcode_buffer_; }; -Stmt RewriteReturn(Stmt 
body, Var ret_var, Var ret_tcode) { - ReturnRewriter rewriter(std::move(ret_var), std::move(ret_tcode)); - return rewriter(std::move(body)); -} - class SubroutineCallRewriter : public StmtExprMutator { public: - static Optional Apply(const Map &packed_func_methods, - Stmt stmt) { + static ffi::Optional + Apply(const ffi::Map &packed_func_methods, + Stmt stmt) { SubroutineCallRewriter rewriter(packed_func_methods); stmt = rewriter.VisitStmt(stmt); if (rewriter.made_change_) { @@ -162,16 +147,16 @@ public: private: explicit SubroutineCallRewriter( - const Map &packed_func_methods) + const ffi::Map &packed_func_methods) : packed_func_methods(packed_func_methods) {} PrimExpr VisitExpr_(const CallNode *op) override { auto node = Downcast(StmtExprMutator::VisitExpr_(op)); if (auto *gvar_ptr = node->op.as()) { - auto gvar = tvm::ffi::GetRef(gvar_ptr); + auto gvar = ffi::GetRef(gvar_ptr); if (auto symbol = packed_func_methods.Get(gvar)) { - Array cpacked_args; + ffi::Array cpacked_args; cpacked_args.push_back(tir::StringImm(symbol.value())); for (auto arg : node->args) { cpacked_args.push_back(arg); @@ -187,19 +172,18 @@ private: return node; } - const Map &packed_func_methods; + const ffi::Map &packed_func_methods; bool made_change_{false}; }; } // namespace -inline Stmt MakeAssertEQ(PrimExpr lhs, PrimExpr rhs, const std::string &msg) { - return AssertStmt(std::move(lhs) == std::move(rhs), tvm::tir::StringImm(msg), - Evaluate(0)); +inline Stmt MakeAssertEQ(PrimExpr lhs, PrimExpr rhs, std::string msg) { + return AssertStmt(lhs == rhs, tvm::tir::StringImm(msg), Evaluate(0)); } -inline Stmt MakeAssertNotNull(PrimExpr ptr, const std::string &msg) { - Call isnull(DataType::Bool(), builtin::isnullptr(), {std::move(ptr)}); +inline Stmt MakeAssertNotNull(PrimExpr ptr, std::string msg) { + Call isnull(DataType::Bool(), builtin::isnullptr(), {ptr}); return AssertStmt(!isnull, tvm::tir::StringImm(msg), Evaluate(0)); } @@ -254,21 +238,16 @@ PrimFunc MakePackedAPI(PrimFunc func) { } 
auto *func_ptr = func.CopyOnWrite(); + // set the global symbol to the packed function name const Stmt nop = Evaluate(0); int num_args = static_cast(func_ptr->params.size()); // Data field definitions // The packed fields + Var v_self_handle("self_handle", DataType::Handle()); Var v_packed_args("args", DataType::Handle()); - Buffer buf_packed_arg_type_ids = - decl_buffer({IntImm(DataType::Int(32), func_ptr->params.size())}, - DataType::Int(32), "arg_type_ids"); Var v_num_packed_args("num_args", DataType::Int(32)); - Var v_out_ret_value("out_ret_value", PointerType(PrimType(DataType::Void()))); - Var v_out_ret_tcode("out_ret_tcode", - PointerType(PrimType(DataType::Int(32)))); - Var v_resource_handle("resource_handle", DataType::Handle()); - // The arguments of the function. + Var v_result("result", PointerType(PrimType(DataType::Void()))); // The device context Var device_id("dev_id"); @@ -278,37 +257,24 @@ PrimFunc MakePackedAPI(PrimFunc func) { std::vector seq_init, seq_check, arg_buffer_declarations; std::unordered_map vmap; ArgBinder binder(&vmap); - std::vector shape_checks; - tvm::transform::PassContext ctxt = tvm::transform::PassContext::Current(); - bool disable_dynamic_tail_split = - ctxt->GetConfig(kDisableDynamicTailSplit, Bool(true)).value(); // --------------------------- // local function definitions // load i-th argument as type t - auto f_arg_value = [&](DataType t, int i) { - Array call_args{ + auto f_load_arg_value = [&](DataType arg_type, int i) { + ffi::Array call_args{ v_packed_args, IntImm(DataType::Int(32), i), - IntImm(DataType::Int(32), builtin::kTVMValueContent)}; + IntImm(DataType::Int(32), builtin::kTVMFFIAnyUnionValue)}; // load 64 bit version - DataType api_type = APIType(t); + DataType api_type = APIType(arg_type); PrimExpr res = Call(api_type, builtin::tvm_struct_get(), call_args); // cast to the target version. 
- if (api_type != t) { - res = Cast(t, res); + if (api_type != arg_type) { + res = Cast(arg_type, res); } return res; }; - // Find the device API context argument based on name - for (const auto ¶m : func_ptr->params) { - if (param->name_hint == kDeviceContextVar) { - num_args--; - v_resource_handle = param; - break; - } - } - // Assert correct type codes for each argument. This must be done // *before* any initialization steps produced by // `binder.BindDLTensor()`. The validity of those initialization @@ -321,12 +287,10 @@ PrimFunc MakePackedAPI(PrimFunc func) { return error_message.str(); }())); - seq_init.push_back(MakeAssertNotNull( - v_packed_args, name_hint + ": TVMValue* arg pointer was NULL")); - seq_init.push_back(MakeAssertNotNull( - buf_packed_arg_type_ids->data, name_hint + ": int* type_codes was NULL")); - - seq_init.emplace_back(DeclBuffer(buf_packed_arg_type_ids, nop)); + if (num_args > 0) { + seq_init.push_back( + MakeAssertNotNull(v_packed_args, name_hint + ": args pointer is NULL")); + } // Need to delay binding of the buffers, in case some arguments also // appear in the buffer. @@ -335,26 +299,17 @@ PrimFunc MakePackedAPI(PrimFunc func) { for (int i = 0; i < static_cast(func_ptr->params.size()); ++i) { Var param = func_ptr->params[i]; - - // Ignore the device context argument, as it will still be passed - // as a native argument. 
- if (param->name_hint == kDeviceContextVar) { - continue; - } - - var_def.emplace_back(f_arg_value(param.dtype(), i), param); - if (func_ptr->buffer_map.count(param)) { - buffer_def.emplace_back(param, func_ptr->buffer_map[param]); - } - - // type code checks - Var type_index(param->name_hint + ".code", DataType::Int(32)); - seq_init.emplace_back(LetStmt( + PrimExpr arg_value; + // type index checks + Var type_index(param->name_hint + ".type_index", DataType::Int(32)); + seq_init.push_back(LetStmt( type_index, - BufferLoad(buf_packed_arg_type_ids, {IntImm(DataType::Int(32), i)}), + tir::Call(DataType::Int(32), builtin::tvm_struct_get(), + {v_packed_args, IntImm(DataType::Int(32), i), + IntImm(DataType::Int(32), builtin::kTVMFFIAnyTypeIndex)}), nop)); - DataType t = param.dtype(); - if (t.is_handle()) { + DataType dtype = param.dtype(); + if (dtype.is_handle()) { std::ostringstream msg; msg << name_hint << ": Expect arg[" << i << "] to be pointer"; seq_init.emplace_back( @@ -363,23 +318,63 @@ PrimFunc MakePackedAPI(PrimFunc func) { type_index == ffi::TypeIndex::kTVMFFIDLTensorPtr || type_index >= ffi::TypeIndex::kTVMFFIStaticObjectBegin, tvm::tir::StringImm(msg.str()), nop)); - } else if (t.is_int() || t.is_uint()) { + // if type_index is Tensor, we need to add the offset of the DLTensor + // header which always equals 16 bytes, this ensures that T.handle always + // shows up as a DLTensor* + const int64_t object_cell_offset = sizeof(TVMFFIObject); + static_assert(object_cell_offset == 24); + arg_value = f_load_arg_value(param.dtype(), i); + PrimExpr handle_from_tensor = + Call(DataType::Handle(), tir::builtin::handle_add_byte_offset(), + {arg_value, IntImm(DataType::Int(32), object_cell_offset)}); + arg_value = Select(type_index == ffi::TypeIndex::kTVMFFITensor, + handle_from_tensor, arg_value); + } else if (dtype.is_bool()) { + std::ostringstream msg; + msg << name_hint << ": Expect arg[" << i << "] to be boolean"; + seq_init.emplace_back( + AssertStmt(type_index 
== ffi::TypeIndex::kTVMFFIBool || + type_index == ffi::TypeIndex::kTVMFFIInt, + tvm::tir::StringImm(msg.str()), nop)); + arg_value = + Cast(DataType::Bool(), f_load_arg_value(DataType::Int(64), i)); + + } else if (dtype.is_int() || dtype.is_uint()) { std::ostringstream msg; msg << name_hint << ": Expect arg[" << i << "] to be int"; - seq_init.emplace_back(AssertStmt(type_index == kDLInt, - tvm::tir::StringImm(msg.str()), nop)); + seq_init.emplace_back( + AssertStmt(type_index == ffi::TypeIndex::kTVMFFIInt || + type_index == ffi::TypeIndex::kTVMFFIBool, + tvm::tir::StringImm(msg.str()), nop)); + arg_value = f_load_arg_value(param.dtype(), i); } else { - ICHECK(t.is_float()); + ICHECK(dtype.is_float()); std::ostringstream msg; msg << name_hint << ": Expect arg[" << i << "] to be float"; - seq_init.emplace_back(AssertStmt(type_index == kDLFloat, - tvm::tir::StringImm(msg.str()), nop)); + seq_init.emplace_back( + AssertStmt(type_index == ffi::TypeIndex::kTVMFFIFloat || + type_index == ffi::TypeIndex::kTVMFFIInt || + type_index == ffi::TypeIndex::kTVMFFIBool, + tvm::tir::StringImm(msg.str()), nop)); + // use select so we can also handle int conversion to bool + arg_value = tir::Select( + type_index == ffi::TypeIndex::kTVMFFIFloat, + /* true_value = */ f_load_arg_value(param.dtype(), i), + /* false_value = */ + Cast(param.dtype(), f_load_arg_value(DataType::Int(64), i))); + } + var_def.emplace_back(arg_value, param); + if (func_ptr->buffer_map.count(param)) { + // buffer binding now depends on type index + // if the index is Tensor handle, we need to offset to get the DLTensor* + buffer_def.emplace_back(param, func_ptr->buffer_map[param]); } } - Array args{v_packed_args, buf_packed_arg_type_ids->data, - v_num_packed_args, v_out_ret_value, - v_out_ret_tcode, v_resource_handle}; + // signature: (void* handle, TVMFFIAny* packed_args, int num_args, TVMFFIAny* + // v_result) + ffi::Array args{v_self_handle, v_packed_args, v_num_packed_args, + v_result}; // Arg definitions are 
defined before buffer binding to avoid the use before // def errors. @@ -392,83 +387,57 @@ PrimFunc MakePackedAPI(PrimFunc func) { binder.Bind(param, expr, name_hint + "." + param->name_hint, true); } - for (const auto &kv : buffer_def) { - binder.BindDLTensor(kv.second, device_type, device_id, kv.first, - name_hint + "." + kv.first->name_hint); - arg_buffer_declarations.push_back(DeclBuffer(kv.second, nop)); + for (const auto &[var, buffer] : buffer_def) { + binder.BindDLTensor(buffer, device_type, device_id, var, + name_hint + "." + var->name_hint); + arg_buffer_declarations.push_back(DeclBuffer(buffer, nop)); } - - func = - WithAttrs(std::move(func), - {{tvm::attr::kCallingConv, Integer(CallingConv::kCPackedFunc)}, - {tvm::attr::kTarget, target_host}}); - Stmt body = RewriteReturn(func_ptr->body, v_out_ret_value, v_out_ret_tcode); + // reset global symbol to attach prefix + func = WithAttrs( + std::move(func), + {{tvm::attr::kCallingConv, static_cast(CallingConv::kCPackedFunc)}, + {tvm::attr::kTarget, target_host}, + {tvm::attr::kGlobalSymbol, + ffi::symbol::tvm_ffi_symbol_prefix + global_symbol.value()}}); + + Stmt body = ReturnRewriter(v_result)(func_ptr->body); body = AttrStmt(make_zero(DataType::Int(32)), tir::attr::compute_scope, StringImm(name_hint + "_compute_"), body); // Set device context if (vmap.count(device_id.get())) { - auto node = String("default"); + ffi::Any node = ffi::String("default"); seq_check.push_back(AttrStmt(node, tir::attr::device_id, device_id, nop)); seq_check.push_back( AttrStmt(node, tir::attr::device_type, device_type, nop)); if (runtime::DeviceAPI::NeedSetDevice(target_device_type)) { Stmt set_device = - Evaluate(Call(DataType::Int(32), builtin::tvm_call_packed(), + Evaluate(Call(DataType::Int(32), tir::builtin::tvm_call_packed(), {StringImm(runtime::symbol::tvm_set_device), device_type, device_id})); body = SeqStmt({set_device, body}); } } - // (zhengju) For dynamic constraint, we need to check the buffer shape and - // dtype 
to make sure the buffer can be vectorized. - for (const auto &kv : buffer_def) { - if (disable_dynamic_tail_split) { - Optional opt_dynamic_alignment = - ctxt->GetConfig(kDynamicAlignment, Optional()); - int dynamic_alignment = opt_dynamic_alignment.value_or(Integer(8))->value; - // The vectorize dimension will be the last dimension of the buffer - auto vectorize_dim = kv.second->shape[kv.second->shape.size() - 1]; - auto shape_vectorize_expr = [&]() -> PrimExpr { - PrimExpr result = IntImm(kv.second->DefaultIndexType(), 1); - result = result * vectorize_dim; - result = FloorMod(result, IntImm(result->dtype, dynamic_alignment)); - return result; - }(); - shape_checks.emplace_back(AssertStmt( - shape_vectorize_expr == 0, - tvm::tir::StringImm( - kv.second->name + - ": Vectorize dimension in buffer must be divisible by " + - std::to_string(dynamic_alignment)), - nop)); - } - } - // Return error code of zero on success body = SeqStmt({body, Evaluate(ret(Integer(0)))}); - if (!disable_dynamic_tail_split) { - body = MergeNest({seq_init, binder.init_nest(), seq_check, binder.asserts(), - arg_buffer_declarations}, - body); - } else { - body = MergeNest({seq_init, binder.init_nest(), seq_check, binder.asserts(), - arg_buffer_declarations, shape_checks}, - body); - } - + body = MergeNest({seq_init, binder.init_nest(), seq_check, binder.asserts(), + arg_buffer_declarations}, + body); func_ptr->body = body; func_ptr->params = args; - Array undefined = UndefinedVars(func_ptr->body, func_ptr->params); + ffi::Array undefined = UndefinedVars(body, func_ptr->params); + ICHECK_EQ(undefined.size(), 0) << "In PrimFunc " << name_hint << " variables " << undefined << " are used, but are not passed in as API arguments"; - func_ptr->buffer_map = Map(); - func_ptr->ret_type = PrimType(DataType::Int(32)); // return the function. + func_ptr->buffer_map = ffi::Map(); + func_ptr->ret_type = PrimType(DataType::Int(32)); + + // return the function. 
return func; } diff --git a/src/transform/simplify.cc b/src/transform/simplify.cc index d64c7016..5a83f0df 100644 --- a/src/transform/simplify.cc +++ b/src/transform/simplify.cc @@ -240,37 +240,42 @@ public: simplifier.MarkBufferMapShapes(func); func.CopyOnWrite()->body = simplifier(func->body); - // Begin to remove useless var and buffer - // First get used buffers - simplifier.used_buffers_ = CollectUsedBuffers(func); - - bool param_updated = false; - Array new_params; - Map new_buffer_map; - // Check whether each buffer is used - for (const auto &var : func->params) { - if (func->buffer_map.find(var) != func->buffer_map.end()) { - if (simplifier.used_buffers_.find(func->buffer_map[var].get()) != - simplifier.used_buffers_.end()) { - new_params.push_back(var); - new_buffer_map.Set(var, func->buffer_map[var]); - } else if (simplifier.used_in_buffer_def_.find( - func->buffer_map[var]->data.get()) != - simplifier.used_in_buffer_def_.end()) { - new_params.push_back(var); - new_buffer_map.Set(var, func->buffer_map[var]); + // Optionally remove unused buffer parameters + if (simplify_arguments) { + // First get used buffers + simplifier.used_buffers_ = CollectUsedBuffers(func); + + bool param_updated = false; + Array new_params; + Map new_buffer_map; + // Check whether each buffer is used + for (const auto &var : func->params) { + if (func->buffer_map.find(var) != func->buffer_map.end()) { + if (simplifier.used_buffers_.find(func->buffer_map[var].get()) != + simplifier.used_buffers_.end()) { + new_params.push_back(var); + new_buffer_map.Set(var, func->buffer_map[var]); + } else if (simplifier.used_in_buffer_def_.find( + func->buffer_map[var]->data.get()) != + simplifier.used_in_buffer_def_.end()) { + new_params.push_back(var); + new_buffer_map.Set(var, func->buffer_map[var]); + } else { + param_updated = true; + } } else { - param_updated = true; + // Non-buffer parameters (e.g., scalars) are always retained + new_params.push_back(var); } } - } - if (param_updated) { - 
return PrimFunc(new_params, func.CopyOnWrite()->body, func->ret_type, - new_buffer_map, func->attrs, func->span); - } else { - return func; + if (param_updated) { + return PrimFunc(new_params, func.CopyOnWrite()->body, func->ret_type, + new_buffer_map, func->attrs, func->span); + } } + // Either no change to params or argument simplification disabled + return func; } private: diff --git a/testing/python/debug/test_tilelang_debug_print.py b/testing/python/debug/test_tilelang_debug_print.py index 1bc76161..fcfae4ed 100644 --- a/testing/python/debug/test_tilelang_debug_print.py +++ b/testing/python/debug/test_tilelang_debug_print.py @@ -13,7 +13,7 @@ def debug_print_buffer(M=16, N=16, dtype="float16"): shared_buf = T.alloc_shared([M, N], dtype) T.print(shared_buf) - jit_kernel = tilelang.compile(program, target="cuda") + jit_kernel = tilelang.compile(program, target="cuda", execution_backend="tvm_ffi") profiler = jit_kernel.get_profiler() profiler.run_once() diff --git a/testing/python/dynamic/test_tilelang_dynamic_symbolic.py b/testing/python/dynamic/test_tilelang_dynamic_symbolic.py index 07f4d784..4b9dff71 100644 --- a/testing/python/dynamic/test_tilelang_dynamic_symbolic.py +++ b/testing/python/dynamic/test_tilelang_dynamic_symbolic.py @@ -514,5 +514,4 @@ def test_assert_tl_matmul_block_all_dynamic_with_pass_config(): if __name__ == "__main__": - # tilelang.testing.main() - assert_tl_matmul_macro_correctness(128, 128, 128, "float16", "float16", "float16") + tilelang.testing.main() diff --git a/testing/python/jit/test_tilelang_jit_gemm_ctypes.py b/testing/python/jit/test_tilelang_jit_gemm_ctypes.py deleted file mode 100644 index fd5243f0..00000000 --- a/testing/python/jit/test_tilelang_jit_gemm_ctypes.py +++ /dev/null @@ -1,411 +0,0 @@ -from tilelang import tvm as tvm -import tilelang.language as T -import tilelang.testing -import tilelang -import torch -from tilelang.utils.tensor import map_torch_type - - -def matmul( - M, - N, - K, - block_M, - block_N, - 
block_K, - trans_A, - trans_B, - in_dtype, - out_dtype, - accum_dtype, - num_stages, - threads, -): - A_shape = (K, M) if trans_A else (M, K) - B_shape = (N, K) if trans_B else (K, N) - A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) - B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) - - @T.prim_func - def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), - ): - with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype) - B_shared = T.alloc_shared(B_shared_shape, in_dtype) - C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - T.clear(C_local) - for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - if trans_A: - T.copy(A[k * block_K, by * block_M], A_shared) - else: - T.copy(A[by * block_M, k * block_K], A_shared) - if trans_B: - T.copy(B[bx * block_N, k * block_K], B_shared) - else: - T.copy(B[k * block_K, bx * block_N], B_shared) - T.gemm(A_shared, B_shared, C_local, trans_A, trans_B) - T.copy(C_local, C[by * block_M, bx * block_N]) - - return main - - -def run_gemm( - M, - N, - K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - block_M, - block_N, - block_K, - num_stages=3, - num_threads=128, -): - program = matmul( - M, - N, - K, - block_M, - block_N, - block_K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - num_stages, - num_threads, - ) - - stramp = "&*(XS)" - - @tvm.register_global_func("tilelang_callback_cuda_postproc", override=True) - def tilelang_callback_cuda_postproc(code, _): - code = f"// {stramp}\n" + code - return code - - matmul_kernel = tilelang.compile(program, out_idx=-1, execution_backend="ctypes") - - kernel_source = matmul_kernel.get_kernel_source() - - assert stramp in kernel_source, f"Expected {stramp} in the kernel source" - - -def test_gemm_f16f16f16_nn(): - run_gemm( - 512, - 1024, - 768, - False, 
- False, - "float16", - "float16", - "float16", - 128, - 256, - 32, - 2, - ) - - -def matmu_jit_kernel( - M, - N, - K, - block_M, - block_N, - block_K, - trans_A, - trans_B, - in_dtype, - out_dtype, - accum_dtype, - num_stages, - threads, -): - A_shape = (K, M) if trans_A else (M, K) - B_shape = (N, K) if trans_B else (K, N) - A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) - B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) - - import tilelang.language as T - - @T.prim_func - def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), - ): - with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype) - B_shared = T.alloc_shared(B_shared_shape, in_dtype) - C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - T.clear(C_local) - for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - if trans_A: - T.copy(A[k * block_K, by * block_M], A_shared) - else: - T.copy(A[by * block_M, k * block_K], A_shared) - if trans_B: - T.copy(B[bx * block_N, k * block_K], B_shared) - else: - T.copy(B[k * block_K, bx * block_N], B_shared) - T.gemm(A_shared, B_shared, C_local, trans_A, trans_B) - T.copy(C_local, C[by * block_M, bx * block_N]) - - return main - - -def run_gemm_jit_kernel( - M, - N, - K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - block_M, - block_N, - block_K, - num_stages=3, - num_threads=128, -): - program = matmu_jit_kernel( - M, - N, - K, - block_M, - block_N, - block_K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - num_stages, - num_threads, - ) - - matmul_kernel = tilelang.compile(program, out_idx=-1, execution_backend="ctypes") - - in_dtype = map_torch_type(in_dtype) - out_dtype = map_torch_type(out_dtype) - - A = torch.randn(M, K, dtype=in_dtype).cuda() - B = torch.randn(K, N, dtype=in_dtype).cuda() - - if trans_A: - A = A.T - if 
trans_B: - B = B.T - - def ref_program(A, B): - import torch - C = torch.matmul(A.to(torch.float), B.to(torch.float)) - C = C.to(out_dtype) - return C - - ref_C = ref_program(A, B) - C = matmul_kernel(A, B) - - tilelang.testing.torch_assert_close(C, ref_C, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) - - -def test_gemm_jit_kernel(): - run_gemm_jit_kernel( - 512, - 1024, - 768, - False, - False, - "float16", - "float16", - "float16", - 128, - 256, - 32, - 2, - ) - - -def run_ctypes_kernel_do_bench(M, - N, - K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - block_M, - block_N, - block_K, - num_stages=3, - num_threads=128): - program = matmul( - M, - N, - K, - block_M, - block_N, - block_K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - num_stages, - num_threads, - ) - - matmul_kernel = tilelang.compile(program, execution_backend="ctypes") - - profiler = matmul_kernel.get_profiler() - - ctypes_latency = profiler.do_bench(func=matmul_kernel) - print(f"Ctypes Latency: {ctypes_latency} ms") - - assert ctypes_latency is not None - - tvm_latency = profiler.do_bench() - print(f"TVM Latency: {tvm_latency} ms") - - assert tvm_latency is not None - - -def test_ctypes_kernel_do_bench(): - run_ctypes_kernel_do_bench(512, 1024, 768, False, False, "float16", "float16", "float16", 128, - 256, 32, 2) - - -def run_ctypes_kernel_multi_stream(M, - N, - K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - block_M, - block_N, - block_K, - num_stages=3, - num_threads=128): - program = matmul( - M, - N, - K, - block_M, - block_N, - block_K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - num_stages, - num_threads, - ) - - matmul_kernel = tilelang.compile(program, execution_backend="ctypes") - in_dtype = map_torch_type(in_dtype) - out_dtype = map_torch_type(out_dtype) - tensor_a = torch.randn(M, K, dtype=in_dtype).cuda() - tensor_b = torch.randn(K, N, dtype=in_dtype).cuda() - - if trans_A: - tensor_a = tensor_a.T - if trans_B: 
- tensor_b = tensor_b.T - tensor_c = torch.randn(M, N, dtype=out_dtype).cuda() - - num_streams = 4 - for _ in range(num_streams): - stream = torch.cuda.Stream() - with torch.cuda.stream(stream): - matmul_kernel(tensor_a, tensor_b, tensor_c) - - -def test_ctypes_kernel_multi_stream(): - run_ctypes_kernel_multi_stream(512, 1024, 768, False, False, "float16", "float16", "float16", - 128, 256, 32, 2) - - -def run_ctypes_dynamic_shape(M, - N, - K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - block_M, - block_N, - block_K, - num_stages=3, - num_threads=128): - program = matmul( - M, - N, - K, - block_M, - block_N, - block_K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - num_stages, - num_threads, - ) - - matmul_kernel = tilelang.compile(program, execution_backend="ctypes") - if isinstance(M, T.Var): - M = 1024 - if isinstance(N, T.Var): - N = 1024 - if isinstance(K, T.Var): - K = 768 - - in_dtype = map_torch_type(in_dtype) - out_dtype = map_torch_type(out_dtype) - - tensor_a = torch.randn(M, K, dtype=in_dtype).cuda() - tensor_b = torch.randn(K, N, dtype=in_dtype).cuda() - - if trans_A: - tensor_a = tensor_a.T - if trans_B: - tensor_b = tensor_b.T - tensor_c = torch.randn(M, N, dtype=out_dtype).cuda() - - matmul_kernel(tensor_a, tensor_b, tensor_c) - - tensor_ref_c = torch.matmul(tensor_a.to(torch.float), tensor_b.to(torch.float)).to(out_dtype) - tilelang.testing.torch_assert_close( - tensor_c, tensor_ref_c, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) - - -def test_ctypes_dynamic_shape(): - run_ctypes_dynamic_shape( - T.dynamic("m"), 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2) - - run_ctypes_dynamic_shape( - T.dynamic("m"), T.dynamic("n"), 768, False, False, "float16", "float16", "float16", 128, - 256, 32, 2) - - run_ctypes_dynamic_shape( - T.dynamic("m"), T.dynamic("n"), T.dynamic("k"), False, False, "float16", "float16", - "float16", 128, 256, 32, 2) - - -if __name__ == "__main__": - # 
tilelang.testing.main() - test_gemm_f16f16f16_nn() diff --git a/testing/python/jit/test_tilelang_jit_nullptr.py b/testing/python/jit/test_tilelang_jit_nullptr.py index 6241ea90..07d4e04c 100644 --- a/testing/python/jit/test_tilelang_jit_nullptr.py +++ b/testing/python/jit/test_tilelang_jit_nullptr.py @@ -83,28 +83,27 @@ def tensor_null_test(M, N, K, block_M, block_N, block_K, dtype="float16", accum_ def run_test(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - func = ptr_null_test(M, N, K, block_M, block_N, block_K, dtype, accum_dtype) + kernel = ptr_null_test(M, N, K, block_M, block_N, block_K, dtype, accum_dtype) a = torch.randn(M, K, device="cuda", dtype=map_torch_type(dtype)) b = torch.randn(N, K, device="cuda", dtype=map_torch_type(dtype)) c = torch.zeros(M, N, device="cuda", dtype=map_torch_type(accum_dtype)) d = torch.randn(N, device="cuda", dtype=map_torch_type(accum_dtype)) - - func(a, b, c, None, M, N, K, False) + kernel(a, b, c, None, M, N, K, False) ref_no_bias = (a @ b.T).to(map_torch_type(accum_dtype)) ref_with_bias = ref_no_bias + d torch.testing.assert_close(c, ref_no_bias, atol=1e-2, rtol=1e-2) - func(a, b, c, d, M, N, K, True) + kernel(a, b, c, d, M, N, K, True) torch.testing.assert_close(c, ref_with_bias, atol=1e-2, rtol=1e-2) - func = tensor_null_test(M, N, K, block_M, block_N, block_K, dtype, accum_dtype) - func(a, b, c, None, False) + kernel = tensor_null_test(M, N, K, block_M, block_N, block_K, dtype, accum_dtype) + kernel(a, b, c, None, False) torch.testing.assert_close(c, ref_no_bias, atol=1e-2, rtol=1e-2) - func(a, b, c, d, True) + kernel(a, b, c, d, True) torch.testing.assert_close(c, ref_with_bias, atol=1e-2, rtol=1e-2) diff --git a/testing/python/jit/test_tilelang_jit_tvm_ffi.py b/testing/python/jit/test_tilelang_jit_tvm_ffi.py new file mode 100644 index 00000000..cd5d9c75 --- /dev/null +++ b/testing/python/jit/test_tilelang_jit_tvm_ffi.py @@ -0,0 +1,589 @@ +from tilelang import tvm as tvm +import 
tilelang.language as T +import tilelang.testing +import tilelang +import torch +from tilelang.utils.tensor import map_torch_type + + +def matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm( + M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128, +): + program = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + stramp = "&*(XS)" + + @tvm.register_global_func("tilelang_callback_cuda_postproc", override=True) + def tilelang_callback_cuda_postproc(code, _): + code = f"// {stramp}\n" + code + return code + + matmul_kernel = tilelang.compile(program, out_idx=-1, execution_backend="tvm_ffi") + + kernel_source = 
matmul_kernel.get_kernel_source() + + assert stramp in kernel_source, f"Expected {stramp} in the kernel source" + + +def test_gemm_f16f16f16_nn(): + run_gemm( + 512, + 1024, + 768, + False, + False, + "float16", + "float16", + "float16", + 128, + 256, + 32, + 2, + ) + + +def matmu_jit_kernel( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm_jit_kernel( + M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128, +): + program = matmu_jit_kernel( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + matmul_kernel = tilelang.compile(program, out_idx=-1, execution_backend="tvm_ffi") + + in_dtype = map_torch_type(in_dtype) + out_dtype 
= map_torch_type(out_dtype) + + A = torch.randn(M, K, dtype=in_dtype).cuda() + B = torch.randn(K, N, dtype=in_dtype).cuda() + + if trans_A: + A = A.T + if trans_B: + B = B.T + + def ref_program(A, B): + import torch + C = torch.matmul(A.to(torch.float), B.to(torch.float)) + C = C.to(out_dtype) + return C + + ref_C = ref_program(A, B) + C = matmul_kernel(A, B) + + tilelang.testing.torch_assert_close(C, ref_C, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) + + +def test_gemm_jit_kernel(): + run_gemm_jit_kernel( + 512, + 1024, + 768, + False, + False, + "float16", + "float16", + "float16", + 128, + 256, + 32, + 2, + ) + + +def run_tvm_ffi_kernel_do_bench(M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128): + program = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + matmul_kernel = tilelang.compile(program, execution_backend="tvm_ffi") + + profiler = matmul_kernel.get_profiler() + + tvm_ffi_latency = profiler.do_bench(func=matmul_kernel) + print(f"tvm_ffi Latency: {tvm_ffi_latency} ms") + + assert tvm_ffi_latency is not None + + tvm_latency = profiler.do_bench() + print(f"TVM Latency: {tvm_latency} ms") + + assert tvm_latency is not None + + +def test_tvm_ffi_kernel_do_bench(): + run_tvm_ffi_kernel_do_bench(512, 1024, 768, False, False, "float16", "float16", "float16", 128, + 256, 32, 2) + + +def run_tvm_ffi_kernel_multi_stream(M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128): + program = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + matmul_kernel = tilelang.compile(program, execution_backend="tvm_ffi") + in_dtype = map_torch_type(in_dtype) + out_dtype = 
map_torch_type(out_dtype) + tensor_a = torch.randn(M, K, dtype=in_dtype).cuda() + tensor_b = torch.randn(K, N, dtype=in_dtype).cuda() + + if trans_A: + tensor_a = tensor_a.T + if trans_B: + tensor_b = tensor_b.T + tensor_c = torch.randn(M, N, dtype=out_dtype).cuda() + + num_streams = 4 + for _ in range(num_streams): + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + matmul_kernel(tensor_a, tensor_b, tensor_c) + + +def test_tvm_ffi_kernel_multi_stream(): + run_tvm_ffi_kernel_multi_stream(512, 1024, 768, False, False, "float16", "float16", "float16", + 128, 256, 32, 2) + + +def run_tvm_ffi_dynamic_shape(M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128): + program = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + matmul_kernel = tilelang.compile(program, execution_backend="tvm_ffi") + if isinstance(M, T.Var): + M = 1024 + if isinstance(N, T.Var): + N = 1024 + if isinstance(K, T.Var): + K = 768 + + in_dtype = map_torch_type(in_dtype) + out_dtype = map_torch_type(out_dtype) + + tensor_a = torch.randn(M, K, dtype=in_dtype).cuda() + tensor_b = torch.randn(K, N, dtype=in_dtype).cuda() + + if trans_A: + tensor_a = tensor_a.T + if trans_B: + tensor_b = tensor_b.T + tensor_c = torch.randn(M, N, dtype=out_dtype).cuda() + + matmul_kernel(tensor_a, tensor_b, tensor_c) + + tensor_ref_c = torch.matmul(tensor_a.to(torch.float), tensor_b.to(torch.float)).to(out_dtype) + tilelang.testing.torch_assert_close( + tensor_c, tensor_ref_c, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) + + +def test_tvm_ffi_dynamic_shape(): + run_tvm_ffi_dynamic_shape( + T.dynamic("m"), 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2) + + run_tvm_ffi_dynamic_shape( + T.dynamic("m"), T.dynamic("n"), 768, False, False, "float16", "float16", "float16", 128, + 256, 32, 2) + 
+ run_tvm_ffi_dynamic_shape( + T.dynamic("m"), T.dynamic("n"), T.dynamic("k"), False, False, "float16", "float16", + "float16", 128, 256, 32, 2) + + +def check_hopper(): + if not torch.cuda.is_available(): + return False + props = torch.cuda.get_device_properties(0) + compute_capability = props.major, props.minor + return compute_capability == (9, 0) + + +def convolution_im2col(N, + C, + H, + W, + F, + K, + S, + D, + P, + block_M, + block_N, + block_K, + num_stages, + threads, + dtype="float16", + accum_dtype="float"): + KH, KW = K, K + OH = (H + 2 * P - D * (K - 1) - 1) // S + 1 + OW = (W + 2 * P - D * (K - 1) - 1) // S + 1 + + @T.prim_func + def main( + data: T.Tensor((N, H, W, C), dtype), + kernel: T.Tensor((KH, KW, C, F), dtype), + out: T.Tensor((N, OH, OW, F), dtype), + ): + with T.Kernel( + T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), + threads=threads) as (bx, by): + data_shared = T.alloc_shared((block_M, block_K), dtype) + kernel_shared = T.alloc_shared((block_K, block_N), dtype) + out_local = T.alloc_fragment((block_M, block_N), accum_dtype) + out_shared = T.alloc_shared((block_M, block_N), dtype) + + kernel_flat = T.Tensor((KH * KW * C, F), dtype, kernel.data) + out_flat = T.Tensor((N * OH * OW, F), dtype, out.data) + + T.annotate_layout({ + out_shared: tilelang.layout.make_swizzled_layout(out_shared), + data_shared: tilelang.layout.make_swizzled_layout(data_shared), + kernel_shared: tilelang.layout.make_swizzled_layout(kernel_shared), + }) + + T.clear(out_local) + for k_iter in T.Pipelined(T.ceildiv(KH * KW * C, block_K), num_stages=num_stages): + T.c2d_im2col(data, data_shared, by, k_iter, KH, S, D, P) + T.copy(kernel_flat[k_iter * block_K, bx * block_N], kernel_shared) + T.gemm(data_shared, kernel_shared, out_local) + + T.copy(out_local, out_shared) + T.copy(out_shared, out_flat[by * block_M, bx * block_N]) + + return main + + +def run_tvm_ffi_im2col_tma_desc(N, + C, + H, + W, + F, + K, + S, + D, + P, + block_M, + block_N, + block_K, + 
num_stages=3, + num_threads=256): + """Test im2col TMA descriptor functionality in tvm_ffi backend.""" + program = convolution_im2col(N, C, H, W, F, K, S, D, P, block_M, block_N, block_K, num_stages, + num_threads) + + conv_kernel = tilelang.compile(program, out_idx=-1, execution_backend="tvm_ffi") + + a = torch.randn(N, H, W, C).cuda().half() + b = torch.randn(K, K, C, F).cuda().half() + + out_c = conv_kernel(a, b) + + # Reference implementation using torch.conv2d + def ref_program(A, B): + A = A.permute(0, 3, 1, 2) # N, H, W, C -> N, C, H, W + B = B.permute(3, 2, 0, 1) # H, W, C, F -> F, C, H, W + C = torch.conv2d(A, B, stride=S, padding=P, dilation=D) + C = C.permute(0, 2, 3, 1) # N, C, H, W -> N, H, W, C + return C + + ref_c = ref_program(a, b) + tilelang.testing.torch_assert_close( + out_c, ref_c, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) + + +def test_tvm_ffi_im2col_tma_desc(): + """Test im2col TMA descriptor with tvm_ffi backend.""" + if not check_hopper(): + import pytest + pytest.skip("Test requires Hopper GPU (compute capability 9.0)") + + # Small test case for im2col TMA descriptor + run_tvm_ffi_im2col_tma_desc( + N=4, + C=64, + H=32, + W=32, + F=64, + K=3, + S=1, + D=1, + P=1, + block_M=64, + block_N=128, + block_K=32, + num_stages=3, + num_threads=256) + + +def test_tvm_ffi_l2_persistent_map(): + """Test L2 persistent cache annotation with elementwise add.""" + from tilelang.language import annotate_l2_hit_ratio + + M = 1024 + N = 1024 + + @tilelang.jit(out_idx=[-1], execution_backend="tvm_ffi") + def elementwise_add_with_l2_cache( + M, + N, + block_size=256, + dtype="float32", + ): + + @T.prim_func + def kernel( + A: T.Tensor((M, N), dtype), + B: T.Tensor((M, N), dtype), + C: T.Tensor((M, N), dtype), + ): + with T.Kernel(M * N // block_size, threads=block_size) as bx: + # Annotate L2 persistent cache for buffer B + # B will be accessed multiple times and benefit from L2 caching + annotate_l2_hit_ratio({B: 0.8}) + + for i in 
T.serial(block_size): + idx = bx * block_size + i + if idx < M * N: + row = idx // N + col = idx % N + C[row, col] = A[row, col] + B[row, col] + + return kernel + + # Compile the kernel + kernel = elementwise_add_with_l2_cache(M, N) + + source = kernel.get_host_source() + assert "__tvm_cuda_stream_set_access_policy_window_packed" in source, "Expected __tvm_cuda_stream_set_access_policy_window_packed in the kernel source" + assert "__tvm_cuda_stream_reset_access_policy_window_packed" in source, "Expected __tvm_cuda_stream_reset_access_policy_window_packed in the kernel source" + + # Create test tensors + a = torch.randn(M, N, dtype=torch.float32).cuda() + b = torch.randn(M, N, dtype=torch.float32).cuda() + + # Run kernel with out_idx=[-1], C is returned not passed in + c = kernel(a, b) + + # Verify correctness + ref_c = a + b + tilelang.testing.torch_assert_close(c, ref_c, atol=1e-5, rtol=1e-5) + + print("L2 persistent map test passed!") + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/testing/python/language/test_tilelang_language_alloc.py b/testing/python/language/test_tilelang_language_alloc.py index 202d6bfa..149a1c28 100644 --- a/testing/python/language/test_tilelang_language_alloc.py +++ b/testing/python/language/test_tilelang_language_alloc.py @@ -113,7 +113,6 @@ def run_alloc_var_with_initializer( kernel = tilelang.compile(program, out_idx=[1]) code = kernel.get_kernel_source() - print(code) assert f"= {init_value};" in code @@ -151,8 +150,7 @@ def run_alloc_multi_vars_with_initializer( program = alloc_multi_vars_with_initializer(N, block_N, dtype) kernel = tilelang.compile(program, out_idx=[1]) - code = kernel.get_kernel_source() - print(code) + code = kernel.get_kernel_source(kernel_only=True) assert code.count("= 1;") == 1 assert code.count("= 2;") == 1 diff --git a/tilelang/autotuner/param.py b/tilelang/autotuner/param.py index b93c4448..3e401cc5 100644 --- a/tilelang/autotuner/param.py +++ b/tilelang/autotuner/param.py @@ -33,7 
+33,7 @@ class CompileArgs: """Compile arguments for the auto-tuner. Detailed description can be found in `tilelang.jit.compile`. Attributes: out_idx: List of output tensor indices. - execution_backend: Execution backend to use for kernel execution (default: "cython"). + execution_backend: Execution backend to use for kernel execution (default: "auto"). target: Compilation target, either as a string or a TVM Target object (default: "auto"). target_host: Target host for cross-compilation (default: None). verbose: Whether to enable verbose output (default: False). @@ -42,7 +42,7 @@ class CompileArgs: """ out_idx: list[int] | int | None = None - execution_backend: Literal["dlpack", "ctypes", "cython"] = "cython" + execution_backend: Literal["auto", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] = "auto" target: Literal['auto', 'cuda', 'hip'] = 'auto' target_host: str | Target = None verbose: bool = False @@ -208,7 +208,7 @@ class AutotuneResult: target: str | Target = "auto", target_host: str | Target = None, out_idx: list[int] | int | None = None, - execution_backend: Literal["dlpack", "ctypes", "cython"] = "cython", + execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] = "tvm_ffi", pass_configs: dict = None, func: Callable = None, verbose: bool = False, diff --git a/tilelang/autotuner/tuner.py b/tilelang/autotuner/tuner.py index 7138f4c1..47ac888c 100644 --- a/tilelang/autotuner/tuner.py +++ b/tilelang/autotuner/tuner.py @@ -139,8 +139,9 @@ class AutoTuner: def set_compile_args(self, out_idx: list[int] | int | None = None, - target: Literal['auto', 'cuda', 'hip'] = 'auto', - execution_backend: Literal["dlpack", "ctypes", "cython"] = "cython", + target: Literal['auto', 'cuda', 'hip', 'metal'] = 'auto', + execution_backend: Literal["auto", "tvm_ffi", "ctypes", "cython", "nvrtc", + "torch"] = "auto", target_host: str | Target = None, verbose: bool = False, pass_configs: dict[str, Any] | None = None): @@ -157,10 +158,15 @@ class AutoTuner: 
Returns: AutoTuner: Self for method chaining. """ + # Normalize target to a concrete TVM Target and resolve execution backend + t = Target(determine_target(target)) + from tilelang.jit.execution_backend import resolve_execution_backend + resolved_backend = resolve_execution_backend(execution_backend, t) + self.compile_args = CompileArgs( out_idx=out_idx, - target=Target(determine_target(target)), - execution_backend=execution_backend, + target=t, + execution_backend=resolved_backend, target_host=target_host, verbose=verbose, pass_configs=pass_configs) @@ -591,7 +597,7 @@ class AutoTuner: func=best_kernel.prim_func, kernel=best_kernel) - if self.compile_args.execution_backend in ("dlpack", "torch"): + if self.compile_args.execution_backend in ("torch"): logger.warning("DLPack backend does not support cache saving to disk.") else: with self._lock: @@ -728,8 +734,9 @@ def autotune( # This is the new public interface Compilation target for TVM (e.g., "cuda", "llvm"). Defaults to "auto". target_host : Union[str, Target], optional Target host for cross-compilation. Defaults to None. - execution_backend : Literal["dlpack", "ctypes", "cython"], optional - Backend for kernel execution and argument passing. Defaults to "cython". + execution_backend : Literal["auto", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch"], optional + Backend for kernel execution and argument passing. Use "auto" to pick a sensible + default per target (cuda->tvm_ffi, metal->torch, others->cython). verbose : bool, optional Enables verbose logging during compilation. Defaults to False. 
pass_configs : Optional[Dict[str, Any]], optional diff --git a/tilelang/cache/__init__.py b/tilelang/cache/__init__.py index c338ce61..144c2729 100644 --- a/tilelang/cache/__init__.py +++ b/tilelang/cache/__init__.py @@ -18,7 +18,8 @@ def cached( *args, target: str | Target = "auto", target_host: str | Target = None, - execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] | None = "cython", + execution_backend: Literal["auto", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] + | None = "auto", verbose: bool | None = False, pass_configs: dict | None = None, compile_flags: list[str] | str | None = None, diff --git a/tilelang/cache/kernel_cache.py b/tilelang/cache/kernel_cache.py index d0a801fb..74ecb278 100644 --- a/tilelang/cache/kernel_cache.py +++ b/tilelang/cache/kernel_cache.py @@ -13,14 +13,15 @@ from typing import Callable, Literal import cloudpickle from tvm.target import Target from tvm.tir import PrimFunc - +from tvm.runtime import Executable from tilelang.engine.param import KernelParam from tilelang import env from tilelang.jit import JITKernel from tilelang import __version__ -KERNEL_PATH = "kernel.cu" -WRAPPED_KERNEL_PATH = "wrapped_kernel.cu" +DEVICE_KERNEL_PATH = "device_kernel.cu" +HOST_KERNEL_PATH = "host_kernel.cu" +EXECUTABLE_PATH = "executable.so" KERNEL_LIB_PATH = "kernel_lib.so" KERNEL_CUBIN_PATH = "kernel.cubin" KERNEL_PY_PATH = "kernel.py" @@ -40,7 +41,7 @@ class KernelCache: _instance = None # For implementing singleton pattern _lock = threading.Lock() # For thread safety _memory_cache = {} # In-memory cache dictionary - execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] = "cython" + execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] = "tvm_ffi" def __new__(cls): """ @@ -69,7 +70,7 @@ class KernelCache: self, func: Callable, out_idx: list[int], - execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] = "cython", + execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", 
"torch"] = "tvm_ffi", args=None, target: str | Target = "auto", target_host: str | Target = None, @@ -117,7 +118,8 @@ class KernelCache: *args, target: str | Target = "auto", target_host: str | Target = None, - execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] = "cython", + execution_backend: Literal["auto", "tvm_ffi", "ctypes", "cython", "nvrtc", + "torch"] = "auto", verbose: bool = False, pass_configs: dict = None, compile_flags: list[str] | str | None = None, @@ -135,12 +137,30 @@ class KernelCache: Returns: JITKernel: The compiled kernel, either freshly compiled or from cache """ + # Normalize target and resolve execution backend before proceeding + from tilelang.utils.target import determine_target as _determine_target + from tilelang.jit.execution_backend import resolve_execution_backend, allowed_backends_for_target + norm_target = Target(_determine_target(target)) if isinstance(target, str) else target + requested_backend = execution_backend + execution_backend = resolve_execution_backend(requested_backend, norm_target) + if verbose: + allowed_now = allowed_backends_for_target(norm_target, include_unavailable=False) + # Avoid duplicate logs when caller already resolved explicitly + if requested_backend in (None, "auto") or requested_backend != execution_backend: + self.logger.info( + "Execution backend resolved -> '%s' (requested='%s', target='%s', allowed: %s)", + execution_backend, + requested_backend, + norm_target.kind.name, + ", ".join(sorted(allowed_now)), + ) + if not env.is_cache_enabled(): return JITKernel( func, out_idx=out_idx, execution_backend=execution_backend, - target=target, + target=norm_target, target_host=target_host, verbose=verbose, pass_configs=pass_configs, @@ -152,7 +172,7 @@ class KernelCache: out_idx=out_idx, execution_backend=execution_backend, args=args, - target=target, + target=norm_target, target_host=target_host, pass_configs=pass_configs, compile_flags=compile_flags, @@ -168,7 +188,7 @@ class KernelCache: 
self.logger.debug(f"Checking disk cache for kernel {func.attrs['global_symbol']}") # Then check disk cache - kernel = self._load_kernel_from_disk(key, target, target_host, out_idx, + kernel = self._load_kernel_from_disk(key, norm_target, target_host, out_idx, execution_backend, pass_configs, compile_flags, func, verbose) if kernel is not None: @@ -186,18 +206,15 @@ class KernelCache: func, out_idx=out_idx, execution_backend=execution_backend, - target=target, + target=norm_target, target_host=target_host, verbose=verbose, pass_configs=pass_configs, compile_flags=compile_flags, ) - if execution_backend in ("dlpack", "torch"): - self.logger.warning("DLPack or torch backend does not support cache saving to disk.") - else: - with self._lock: - if env.is_cache_enabled(): - self._save_kernel_to_disk(key, kernel, func, verbose) + with self._lock: + if env.is_cache_enabled(): + self._save_kernel_to_disk(key, kernel, func, verbose) # Store in memory cache after compilation self._memory_cache[key] = kernel @@ -239,6 +256,12 @@ class KernelCache: # Use atomic POSIX replace, so other processes cannot see a partial write os.replace(temp_path, path) + @staticmethod + def _safe_write_executable(executable: Executable, path: str): + temp_path = os.path.join(env.TILELANG_TMP_DIR, f"{os.getpid()}_{uuid.uuid4()}.so") + executable.export_library(temp_path) + os.replace(temp_path, path) + def _save_kernel_to_disk(self, key: str, kernel: JITKernel, @@ -265,41 +288,46 @@ class KernelCache: # Save kernel source code try: - kernel_path = os.path.join(cache_path, KERNEL_PATH) + device_kernel_path = os.path.join(cache_path, DEVICE_KERNEL_PATH) if verbose: - self.logger.debug(f"Saving kernel source code to file: {kernel_path}") + self.logger.debug(f"Saving kernel source code to file: {device_kernel_path}") if kernel.kernel_source is not None: - KernelCache._safe_write_file(kernel_path, "w", + KernelCache._safe_write_file(device_kernel_path, "w", lambda file: file.write(kernel.kernel_source)) 
except Exception as e: self.logger.error(f"Error saving kernel source code to disk: {e}") # Save wrapped kernel source code try: - wrapped_kernel_path = os.path.join(cache_path, WRAPPED_KERNEL_PATH) + host_kernel_path = os.path.join(cache_path, HOST_KERNEL_PATH) if verbose: - self.logger.debug( - f"Saving wrapped kernel source code to file: {wrapped_kernel_path}") - KernelCache._safe_write_file( - wrapped_kernel_path, "w", - lambda file: file.write(kernel.adapter.get_kernel_source())) + self.logger.debug(f"Saving wrapped kernel source code to file: {host_kernel_path}") + if self.execution_backend == "tvm_ffi": + KernelCache._safe_write_file( + host_kernel_path, "w", + lambda file: file.write(kernel.adapter.get_host_source())) + else: + KernelCache._safe_write_file( + host_kernel_path, "w", + lambda file: file.write(kernel.adapter.get_kernel_source())) except Exception as e: - self.logger.error(f"Error saving wrapped kernel source code to disk: {e}") + self.logger.error(f"Error saving host kernel source code to disk: {e}") # Save the kernel library try: # Save CUBIN or SO file - kernel_lib_path = KERNEL_CUBIN_PATH if self.execution_backend == "nvrtc" else KERNEL_LIB_PATH + if self.execution_backend == "nvrtc": + kernel_lib_path = KERNEL_CUBIN_PATH + elif self.execution_backend == "tvm_ffi": + kernel_lib_path = EXECUTABLE_PATH + else: + kernel_lib_path = KERNEL_LIB_PATH + kernel_lib_path = os.path.join(cache_path, kernel_lib_path) - src_lib_path = kernel.adapter.libpath - if verbose: - self.logger.debug(f"Saving kernel library to file: {kernel_lib_path}") - KernelCache._safe_write_file( - kernel_lib_path, "wb", - lambda file: file.write(KernelCache._load_binary(src_lib_path))) # Save an extra Python file for NVRTC if self.execution_backend == "nvrtc": + src_lib_path = kernel.adapter.libpath kernel_py_path = os.path.join(cache_path, KERNEL_PY_PATH) src_lib_path = src_lib_path.replace(".cubin", ".py") if verbose: @@ -307,6 +335,19 @@ class KernelCache: 
KernelCache._safe_write_file( kernel_py_path, "wb", lambda file: file.write(KernelCache._load_binary(src_lib_path))) + elif self.execution_backend == "tvm_ffi": + executable = kernel.adapter.executable + if verbose: + self.logger.debug(f"Saving kernel executable to file: {executable}") + KernelCache._safe_write_executable(executable, kernel_lib_path) + else: + src_lib_path = kernel.adapter.libpath + if verbose: + self.logger.debug(f"Saving kernel library to file: {kernel_lib_path}") + KernelCache._safe_write_file( + kernel_lib_path, "wb", + lambda file: file.write(KernelCache._load_binary(src_lib_path))) + except Exception as e: self.logger.error(f"Error saving kernel library to disk: {e}") @@ -326,7 +367,7 @@ class KernelCache: target: str | Target = "auto", target_host: str | Target = None, out_idx: list[int] = None, - execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] = "cython", + execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] = "tvm_ffi", pass_configs: dict = None, compile_flags: list[str] | str | None = None, func: Callable = None, @@ -349,25 +390,39 @@ class KernelCache: JITKernel: The loaded kernel if found, None otherwise. 
""" cache_path = self._get_cache_path(key) - wrapped_kernel_path = os.path.join(cache_path, WRAPPED_KERNEL_PATH) - kernel_lib_path = os.path.join( - cache_path, KERNEL_CUBIN_PATH if self.execution_backend == "nvrtc" else KERNEL_LIB_PATH) + device_kernel_path = os.path.join(cache_path, DEVICE_KERNEL_PATH) + host_kernel_path = os.path.join(cache_path, HOST_KERNEL_PATH) + if self.execution_backend == "nvrtc": + kernel_lib_path = KERNEL_CUBIN_PATH + elif self.execution_backend == "tvm_ffi": + kernel_lib_path = EXECUTABLE_PATH + else: + kernel_lib_path = KERNEL_LIB_PATH + kernel_lib_path = os.path.join(cache_path, kernel_lib_path) params_path = os.path.join(cache_path, PARAMS_PATH) if not all([os.path.exists(file) for file in (kernel_lib_path, params_path)]): return None - kernel_global_source: str | None = None + device_kernel_source: str | None = None + host_kernel_source: str | None = None kernel_params: list[KernelParam] | None = None # Load the kernel source file (optional) + try: + if verbose: + self.logger.debug(f"Loading kernel source code from file: {device_kernel_path}") + with open(device_kernel_path) as f: + device_kernel_source = f.read() + except Exception as e: + self.logger.error(f"Error loading kernel source code from disk: {e}") try: if verbose: self.logger.debug( - f"Loading wrapped kernel source code from file: {wrapped_kernel_path}") - with open(wrapped_kernel_path) as f: - kernel_global_source = f.read() + f"Loading wrapped kernel source code from file: {host_kernel_path}") + with open(host_kernel_path) as f: + host_kernel_source = f.read() except Exception as e: - self.logger.error(f"Error loading wrapped kernel source code from disk: {e}") + self.logger.error(f"Error loading host kernel source code from disk: {e}") # Load kernel parameters try: @@ -378,10 +433,11 @@ class KernelCache: except Exception as e: self.logger.error(f"Error loading kernel parameters from disk: {e}") - if kernel_global_source and kernel_params: + if host_kernel_source and 
device_kernel_source and kernel_params: return JITKernel.from_database( func=func, - kernel_global_source=kernel_global_source, + host_kernel_source=host_kernel_source, + device_kernel_source=device_kernel_source, kernel_lib_path=kernel_lib_path, params=kernel_params, target=target, @@ -392,6 +448,7 @@ class KernelCache: compile_flags=compile_flags, ) else: + # TODO(lei): report what the reason is. return None def _clear_disk_cache(self): diff --git a/tilelang/contrib/dlpack.py b/tilelang/contrib/dlpack.py index e61d80ce..6772fe11 100644 --- a/tilelang/contrib/dlpack.py +++ b/tilelang/contrib/dlpack.py @@ -59,23 +59,3 @@ def convert_func(tvm_func, tensor_type, to_dlpack_func): return tvm_func(*args) return _wrapper - - -def to_pytorch_func(tvm_func): - """Convert a tvm function into one that accepts PyTorch tensors - - Parameters - ---------- - tvm_func: Function - Built tvm function operating on arrays - - Returns - ------- - wrapped_func: Function - Wrapped tvm function that operates on PyTorch tensors - """ - # pylint: disable=import-outside-toplevel - import torch - import torch.utils.dlpack - - return convert_func(tvm_func, torch.Tensor, torch.utils.dlpack.to_dlpack) diff --git a/tilelang/engine/lower.py b/tilelang/engine/lower.py index d0c27b4c..c2a14552 100644 --- a/tilelang/engine/lower.py +++ b/tilelang/engine/lower.py @@ -146,7 +146,7 @@ def host_codegen(host_mod: tvm.IRModule, target_host: Target) -> tvm.IRModule: if target_host.kind.name == "llvm": host_mod = tvm.ffi.get_global_func("target.build.llvm")(host_mod, target_host) elif target_host.kind.name == "c": - host_mod = tvm.ffi.get_global_func("target.build.c")(host_mod, target_host) + host_mod = tvm.ffi.get_global_func("target.build.tilelang_c")(host_mod, target_host) else: raise ValueError(f"Target host {target_host.kind.name} is not supported") return host_mod diff --git a/tilelang/jit/__init__.py b/tilelang/jit/__init__.py index 24378ac8..9f0e25f4 100644 --- a/tilelang/jit/__init__.py +++ 
b/tilelang/jit/__init__.py @@ -23,7 +23,6 @@ except ImportError: # Python < 3.10 from typing_extensions import ParamSpec from tilelang import tvm as tvm from tilelang.language.v2 import PrimFunc -from tilelang.jit.adapter.utils import is_metal_target from tvm.target import Target from tilelang.jit.kernel import JITKernel @@ -46,7 +45,8 @@ _T = TypeVar('_T') def compile( func: PrimFunc[_KP, _T] = None, out_idx: list[int] | int | None = None, - execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] = "cython", + execution_backend: Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", + "torch"] = "auto", target: str | Target = "auto", target_host: str | Target | None = None, verbose: bool = False, @@ -61,8 +61,9 @@ def compile( The TileLang TIR function to compile and wrap. out_idx : Union[List[int], int], optional Index(es) of the output tensors to return (default: None). - execution_backend : Literal["dlpack", "ctypes", "cython", "nvrtc"], optional - Execution backend to use for kernel execution (default: "cython"). + execution_backend : Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch"], optional + Execution backend to use for kernel execution. Use "auto" to pick a sensible + default per target (cuda->tvm_ffi, metal->torch, others->cython). target : Union[str, Target], optional Compilation target, either as a string or a TVM Target object (default: "auto"). target_host : Union[str, Target], optional @@ -80,8 +81,19 @@ def compile( # This path is not a performance critical path, so we can afford to convert the target. 
target = Target(determine_target(target)) - if is_metal_target(target): - assert execution_backend == 'torch', 'Currently metal target only support `tl.jit(execution_backend="torch")`' + # Resolve execution backend (handles aliases, auto, validation per target) + requested_backend = execution_backend + from tilelang.jit.execution_backend import resolve_execution_backend, allowed_backends_for_target + execution_backend = resolve_execution_backend(requested_backend, target) + if verbose: + allowed_now = allowed_backends_for_target(target, include_unavailable=False) + logger.info( + "Execution backend resolved -> '%s' (requested='%s', target='%s', allowed: %s)", + execution_backend, + requested_backend, + target.kind.name, + ", ".join(sorted(allowed_now)), + ) return cached( func=func, @@ -97,7 +109,8 @@ def compile( def par_compile(funcs: Iterable[PrimFunc[_KP, _T]], out_idx: list[int] | int | None = None, - execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] = "cython", + execution_backend: Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", + "torch"] = "auto", target: str | Target = "auto", target_host: str | Target | None = None, verbose: bool = False, @@ -113,8 +126,9 @@ def par_compile(funcs: Iterable[PrimFunc[_KP, _T]], The TileLang TIR functions to compile and wrap. out_idx : Union[List[int], int], optional Index(es) of the output tensors to return (default: None). - execution_backend : Literal["dlpack", "ctypes", "cython", "nvrtc"], optional - Execution backend to use for kernel execution (default: "cython"). + execution_backend : Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch"], optional + Execution backend to use for kernel execution. Use "auto" to pick a sensible + default per target (cuda->tvm_ffi, metal->torch, others->cython). target : Union[str, Target], optional Compilation target, either as a string or a TVM Target object (default: "auto"). 
target_host : Union[str, Target], optional @@ -165,7 +179,7 @@ def par_compile(funcs: Iterable[PrimFunc[_KP, _T]], class JITImpl(Generic[_P, _KP, _T]): func: Callable[_P, _T] | PrimFunc[_KP, _T] out_idx: list[int] | int | None - execution_backend: Literal["dlpack", "ctypes", "cython"] + execution_backend: Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] target: str | Target target_host: str | Target verbose: bool @@ -286,7 +300,8 @@ def jit( out_idx: Any = None, target: str | Target = "auto", target_host: str | Target = None, - execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] = "cython", + execution_backend: Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", + "torch"] = "auto", verbose: bool = False, pass_configs: dict[str, Any] | None = None, debug_root_path: str | None = None, @@ -301,7 +316,8 @@ def jit( # This is the new public interface out_idx: Any = None, target: str | Target = "auto", target_host: str | Target = None, - execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] = "cython", + execution_backend: Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", + "torch"] = "auto", verbose: bool = False, pass_configs: dict[str, Any] | None = None, debug_root_path: str | None = None, @@ -322,8 +338,9 @@ def jit( # This is the new public interface Compilation target for TVM (e.g., "cuda", "llvm"). Defaults to "auto". target_host : Union[str, Target], optional Target host for cross-compilation. Defaults to None. - execution_backend : Literal["dlpack", "ctypes", "cython", "nvrtc"], optional - Backend for kernel execution and argument passing. Defaults to "cython". + execution_backend : Literal["auto", "dlpack", "tvm_ffi", "ctypes", "cython", "nvrtc", "torch"], optional + Backend for kernel execution and argument passing. Use "auto" to pick a sensible + default per target (cuda->tvm_ffi, metal->torch, others->cython). 
verbose : bool, optional Enables verbose logging during compilation. Defaults to False. pass_configs : Optional[Dict[str, Any]], optional diff --git a/tilelang/jit/adapter/__init__.py b/tilelang/jit/adapter/__init__.py index 0e8fb98c..dcfdaf5b 100644 --- a/tilelang/jit/adapter/__init__.py +++ b/tilelang/jit/adapter/__init__.py @@ -1,5 +1,5 @@ from .base import BaseKernelAdapter # noqa: F401 -from .dlpack import TorchDLPackKernelAdapter # noqa: F401 +from .tvm_ffi import TVMFFIKernelAdapter # noqa: F401 from .ctypes import CtypesKernelAdapter # noqa: F401 from .cython import CythonKernelAdapter # noqa: F401 from .nvrtc import NVRTCKernelAdapter # noqa: F401 diff --git a/tilelang/jit/adapter/base.py b/tilelang/jit/adapter/base.py index 9d998bc9..6bd69cff 100644 --- a/tilelang/jit/adapter/base.py +++ b/tilelang/jit/adapter/base.py @@ -4,6 +4,7 @@ from __future__ import annotations from abc import ABC, abstractmethod from typing import Any, Callable from tilelang.engine.param import KernelParam +import torch class BaseKernelAdapter(ABC): @@ -46,11 +47,54 @@ class BaseKernelAdapter(ABC): def _convert_torch_func(self) -> callable: pass + # --- Common helpers to align with PyTorch stream/device semantics --- + @staticmethod + def get_current_stream_functor() -> Callable[[], int]: + """Return a callable that reads Torch's current CUDA stream pointer. + + The returned lambda yields the raw CUDA stream handle of the current + PyTorch stream on the active device. It's a thunk (evaluated at call + time) so that any upstream stream guards are respected. If CUDA is + unavailable, it returns a lambda that yields 0. 
+ """ + if torch.cuda.is_available(): + try: + torch.cuda._lazy_init() + current_device = torch._C._cuda_getDevice + get_stream = torch._C._cuda_getCurrentRawStream + return lambda: get_stream(current_device()) + except Exception: + # Fallback to Python API if internal handles are unavailable + return lambda: int(torch.cuda.current_stream().cuda_stream) + # CPU or CUDA unavailable: no stream semantics + return lambda: 0 + + @staticmethod + def get_current_device_functor() -> Callable[[], torch.device]: + """Return a callable that yields Torch's current device. + + Similar to the stream functor, we capture a callable that, when called, + fetches the current device according to PyTorch. On CPU or when CUDA is + unavailable, returns ``torch.device('cpu')``. + """ + if torch.cuda.is_available(): + try: + torch.cuda._lazy_init() + current_device = torch._C._cuda_getDevice + return lambda: torch.device("cuda", current_device()) + except Exception: + return lambda: torch.device("cuda", torch.cuda.current_device()) + # CPU fallback + return lambda: torch.device("cpu") + def __call__(self, *args: Any, **kwds: Any) -> Any: return self.func(*args, **kwds) - def get_kernel_source(self) -> str: - return self.mod.imported_modules[0].get_source() + def get_kernel_source(self, kernel_only: bool = True) -> str: + if kernel_only: + return self.mod.imports[0].inspect_source() + else: + return self.mod.inspect_source() + "\n\n" + self.mod.imports[0].inspect_source() def _post_init(self): self.func = self._convert_torch_func() diff --git a/tilelang/jit/adapter/ctypes/adapter.py b/tilelang/jit/adapter/ctypes/adapter.py index bf0aef51..e2677305 100644 --- a/tilelang/jit/adapter/ctypes/adapter.py +++ b/tilelang/jit/adapter/ctypes/adapter.py @@ -14,6 +14,7 @@ from tilelang.utils.target import determine_target from tilelang.utils.language import retrieve_func_from_module +# TODO(lei): remove ctypes adapter. 
class CtypesKernelAdapter(BaseKernelAdapter): """Adapter class that converts TVM/TIR functions to callable CUDA kernels using ctypes. @@ -28,9 +29,9 @@ class CtypesKernelAdapter(BaseKernelAdapter): ir_module: tvm.IRModule | None = None # The global source code of the kernel -> global means the source code of the kernel # that is not wrapped by the wrapper code - kernel_global_source: str | None = None + host_kernel_source: str | None = None + device_kernel_source: str | None = None lib: ctypes.CDLL | None = None # Compiled library handle - wrapped_source: str | None = None # Generated C++ wrapper code # Maps symbolic variables to their corresponding buffer and shape indices dynamic_symbolic_map: dict[tir.Var, tuple[int, int]] | None = None # Pass configs for the compiler @@ -47,7 +48,8 @@ class CtypesKernelAdapter(BaseKernelAdapter): func_or_mod: tir.PrimFunc | tvm.IRModule, host_mod: tvm.IRModule | None = None, device_mod: tvm.IRModule | None = None, - kernel_global_source: str | None = None, + host_kernel_source: str | None = None, + device_kernel_source: str | None = None, verbose: bool = False, pass_configs: dict[str, Any] | None = None, compile_flags: list[str] | None = None): @@ -62,7 +64,8 @@ class CtypesKernelAdapter(BaseKernelAdapter): """ self.params = params self.result_idx = self._legalize_result_idx(result_idx) - self.kernel_global_source = kernel_global_source + self.host_kernel_source = host_kernel_source + self.device_kernel_source = device_kernel_source if isinstance(func_or_mod, tir.PrimFunc): self.ir_module = tvm.IRModule({func_or_mod.attrs["global_symbol"]: func_or_mod}) @@ -111,7 +114,8 @@ class CtypesKernelAdapter(BaseKernelAdapter): result_idx: list[int], target: str, func_or_mod: tir.PrimFunc | tvm.IRModule, - kernel_global_source: str, + host_kernel_source: str, + device_kernel_source: str, kernel_lib_path: str, verbose: bool = False, pass_configs: dict[str, Any] | None = None, @@ -119,8 +123,9 @@ class 
CtypesKernelAdapter(BaseKernelAdapter): adapter = cls.__new__(cls) adapter.params = params adapter.result_idx = adapter._legalize_result_idx(result_idx) - adapter.kernel_global_source = kernel_global_source - adapter.wrapped_source = kernel_global_source + adapter.host_kernel_source = host_kernel_source + adapter.device_kernel_source = device_kernel_source + adapter.wrapped_source = device_kernel_source + "\n\n" + host_kernel_source adapter.pass_configs = pass_configs if isinstance(func_or_mod, tir.PrimFunc): @@ -288,7 +293,7 @@ class CtypesKernelAdapter(BaseKernelAdapter): def get_kernel_source(self, kernel_only: bool = False): """Returns the source code of the compiled kernel.""" if kernel_only: - return self.kernel_global_source + return self.device_kernel_source else: - assert self.wrapped_source is not None, "Wrapped source is not available" - return self.wrapped_source + # Wrapper only has host kernel source + return self.host_kernel_source diff --git a/tilelang/jit/adapter/cython/adapter.py b/tilelang/jit/adapter/cython/adapter.py index bc43533b..fe8fe5bd 100644 --- a/tilelang/jit/adapter/cython/adapter.py +++ b/tilelang/jit/adapter/cython/adapter.py @@ -48,9 +48,9 @@ class CythonKernelAdapter(BaseKernelAdapter): ir_module: tvm.IRModule | None = None # The global source code of the kernel -> global means the source code of the kernel # that is not wrapped by the wrapper code - kernel_global_source: str | None = None + host_kernel_source: str | None = None + device_kernel_source: str | None = None lib: ctypes.CDLL | None = None # Compiled library handle - wrapped_source: str | None = None # Generated C++ wrapper code # Maps symbolic variables to their corresponding buffer and shape indices dynamic_symbolic_map: dict[tir.Var, tuple[int, int]] | None = None # Maps pointer arguments to their corresponding (buffer_index, shape_dimension) @@ -77,7 +77,7 @@ class CythonKernelAdapter(BaseKernelAdapter): func_or_mod: tir.PrimFunc | tvm.IRModule, host_mod: 
tvm.IRModule | None = None, device_mod: tvm.IRModule | None = None, - kernel_global_source: str | None = None, + device_kernel_source: str | None = None, verbose: bool = False, pass_configs: dict[str, Any] | None = None, compile_flags: list[str] | None = None): @@ -92,7 +92,7 @@ class CythonKernelAdapter(BaseKernelAdapter): """ self.params = params self.result_idx = self._legalize_result_idx(result_idx) - self.kernel_global_source = kernel_global_source + self.device_kernel_source = device_kernel_source if isinstance(func_or_mod, tir.PrimFunc): self.ir_module = tvm.IRModule({func_or_mod.attrs["global_symbol"]: func_or_mod}) @@ -121,9 +121,9 @@ class CythonKernelAdapter(BaseKernelAdapter): self.wrapper.assign_pass_configs(pass_configs) self.wrapper.assign_host_module(host_mod) self.wrapper.assign_device_module(device_mod) - self.wrapped_source = self.wrapper.wrap(self.get_kernel_source(kernel_only=True)) + self.host_kernel_source = self.wrapper.wrap(self.get_kernel_source(kernel_only=True)) - self.lib_generator.update_lib_code(self.wrapped_source) + self.lib_generator.update_lib_code(self.host_kernel_source) self.lib_generator.compile_lib() self.lib = self.lib_generator.load_lib() @@ -150,7 +150,8 @@ class CythonKernelAdapter(BaseKernelAdapter): result_idx: list[int], target: str, func_or_mod: tir.PrimFunc | tvm.IRModule, - kernel_global_source: str, + host_kernel_source: str, + device_kernel_source: str, kernel_lib_path: str, verbose: bool = False, pass_configs: dict[str, Any] | None = None, @@ -158,8 +159,8 @@ class CythonKernelAdapter(BaseKernelAdapter): adapter = cls.__new__(cls) adapter.params = params adapter.result_idx = adapter._legalize_result_idx(result_idx) - adapter.kernel_global_source = kernel_global_source - adapter.wrapped_source = kernel_global_source + adapter.host_kernel_source = host_kernel_source + adapter.device_kernel_source = device_kernel_source adapter.pass_configs = pass_configs if isinstance(func_or_mod, tir.PrimFunc): @@ -382,7 +383,8 @@ 
class CythonKernelAdapter(BaseKernelAdapter): def get_kernel_source(self, kernel_only: bool = False): """Returns the source code of the compiled kernel.""" if kernel_only: - return self.kernel_global_source + return self.device_kernel_source else: - assert self.wrapped_source is not None, "Wrapped source is not available" - return self.wrapped_source + # Wrapper only has host kernel source + assert self.host_kernel_source is not None, "Wrapped source is not available" + return self.host_kernel_source diff --git a/tilelang/jit/adapter/dlpack.py b/tilelang/jit/adapter/dlpack.py deleted file mode 100644 index 402dfb2f..00000000 --- a/tilelang/jit/adapter/dlpack.py +++ /dev/null @@ -1,40 +0,0 @@ -"""The profiler and convert to torch utils""" -import torch -from tilelang.contrib.dlpack import to_pytorch_func -from .base import BaseKernelAdapter - - -class TorchDLPackKernelAdapter(BaseKernelAdapter): - - def _convert_torch_func(self) -> callable: - torch_func = to_pytorch_func(self.mod) - - def func(*ins: list[torch.Tensor]): - if len(ins) + len(self.result_idx) != len(self.params): - raise ValueError( - f"Expected {len(self.params)} inputs, got {len(ins) + len(self.result_idx)} with {len(ins)} inputs and {len(self.result_idx)} outputs" - ) - ins_idx = 0 - args = [] - - # use the device of the first input tensor if available - device = ins[0].device if len(ins) > 0 else torch.cuda.current_device() - - for i in range(len(self.params)): - if i in self.result_idx: - dtype = self.params[i].dtype - shape = list(map(int, self.params[i].shape)) - tensor = torch.empty(*shape, dtype=dtype, device=device) - else: - tensor = ins[ins_idx] - ins_idx += 1 - args.append(tensor) - - torch_func(*args) - - if len(self.result_idx) == 1: - return args[self.result_idx[0]] - else: - return [args[i] for i in self.result_idx] - - return func diff --git a/tilelang/jit/adapter/nvrtc/adapter.py b/tilelang/jit/adapter/nvrtc/adapter.py index 5f8a2827..4a465d33 100644 --- 
a/tilelang/jit/adapter/nvrtc/adapter.py +++ b/tilelang/jit/adapter/nvrtc/adapter.py @@ -34,7 +34,7 @@ class NVRTCKernelAdapter(BaseKernelAdapter): func_or_mod: tir.PrimFunc | tvm.IRModule, host_mod: tvm.IRModule | None = None, device_mod: tvm.IRModule | None = None, - kernel_global_source: str | None = None, + device_kernel_source: str | None = None, verbose: bool = False, pass_configs: dict[str, Any] | None = None, compile_flags: list[str] | None = None): @@ -43,7 +43,7 @@ class NVRTCKernelAdapter(BaseKernelAdapter): self.params = params self.result_idx = self._legalize_result_idx(result_idx) - self.kernel_global_source = kernel_global_source + self.device_kernel_source = device_kernel_source if isinstance(func_or_mod, tir.PrimFunc): self.ir_module = tvm.IRModule({func_or_mod.attrs["global_symbol"]: func_or_mod}) @@ -74,10 +74,10 @@ class NVRTCKernelAdapter(BaseKernelAdapter): self.wrapper.assign_pass_configs(pass_configs) self.wrapper.assign_host_module(host_mod) self.wrapper.assign_device_module(device_mod) - self.host_func, self.function_names = self.wrapper.wrap(kernel_global_source) + self.host_func, self.function_names = self.wrapper.wrap(device_kernel_source) self.lib_generator = NVRTCLibraryGenerator(self.target, self.verbose) - self.lib_generator.update_lib_code(self.kernel_global_source) + self.lib_generator.update_lib_code(self.device_kernel_source) self.lib_generator.update_host_func(self.host_func) self.lib_generator.assign_compile_flags(compile_flags) self.lib_generator.compile_lib() @@ -97,7 +97,8 @@ class NVRTCKernelAdapter(BaseKernelAdapter): result_idx: list[int], target: str, func_or_mod: tir.PrimFunc | tvm.IRModule, - kernel_global_source: str, + host_kernel_source: str, + device_kernel_source: str, kernel_lib_path: str, verbose: bool = False, pass_configs: dict[str, Any] | None = None, @@ -105,7 +106,8 @@ class NVRTCKernelAdapter(BaseKernelAdapter): adapter = cls.__new__(cls) adapter.params = params adapter.result_idx = 
adapter._legalize_result_idx(result_idx) - adapter.kernel_global_source = kernel_global_source + adapter.host_kernel_source = host_kernel_source + adapter.device_kernel_source = device_kernel_source if isinstance(func_or_mod, tir.PrimFunc): adapter.ir_module = tvm.IRModule({func_or_mod.attrs["global_symbol"]: func_or_mod}) @@ -167,7 +169,7 @@ class NVRTCKernelAdapter(BaseKernelAdapter): dynamic_symbolic_map[shape] = (i, j) return dynamic_symbolic_map - def get_kernel_source(self) -> str | None: + def get_kernel_source(self, kernel_only: bool = True) -> str | None: """Get the CUDA kernel source code. Returns @@ -175,7 +177,10 @@ class NVRTCKernelAdapter(BaseKernelAdapter): Optional[str] The kernel source code, or None if not available """ - return self.kernel_global_source + if kernel_only: + return self.device_kernel_source + else: + return self.host_func def _forward_from_prebuild_lib(self, *args, stream: int | None = None): """Low-level function to call the compiled CUDA kernel. diff --git a/tilelang/jit/adapter/tvm_ffi.py b/tilelang/jit/adapter/tvm_ffi.py new file mode 100644 index 00000000..e06e9862 --- /dev/null +++ b/tilelang/jit/adapter/tvm_ffi.py @@ -0,0 +1,321 @@ +"""Utilities to adapt TVM FFI kernels to Torch tensors. + +This adapter intentionally captures PyTorch's current CUDA stream and device +via light-weight callables so that, when the wrapped function is invoked, +the execution observes the same stream context as the active Torch code. +On non-CUDA builds, the stream/device fall back to 0/CPU semantics. 
+""" +from __future__ import annotations + +from typing import Callable, Any + +import torch +from tilelang import tvm +from tvm import runtime, tir +from tvm.target import Target +from tvm.relax import TensorType +from tilelang.utils.target import determine_target +from tilelang.jit.adapter.base import BaseKernelAdapter +from tilelang.utils.language import retrieve_func_from_module +from tilelang.engine.param import KernelParam + + +class TVMFFIKernelAdapter(BaseKernelAdapter): + """Adapter that runs a TVM runtime.Executable with Torch tensors. + + Notes + - We capture the "current" PyTorch CUDA stream/device as thunks (callables) + rather than materializing them at construction time. This ensures the + actual stream/device is read just-in-time when the function runs, matching + the user's current Torch context (e.g., after a stream guard/switch). + - The stream pointer returned is a raw CUDA stream handle compatible with + TVM's device API; on CPU or when CUDA is unavailable, we return 0. 
+ """ + # Class attributes to store compiled kernel information + target: str | Target = "cuda" + ir_module: tvm.IRModule | None = None + # The global source code of the kernel -> global means the source code of the kernel + # that is not wrapped by the wrapper code + host_kernel_source: str | None = None + device_kernel_source: str | None = None + executable: tvm.runtime.Executable | None = None + # Pass configs for the compiler + pass_configs: dict[str, Any] | None = None + # host_mod + host_mod: tvm.IRModule | None = None + # device_mod + device_mod: tvm.IRModule | None = None + # rt_mod + rt_mod: tvm.runtime.Module | None = None + # Maps symbolic variables to their corresponding buffer and shape indices + dynamic_symbolic_map: dict[tir.Var, tuple[int, int, int]] | None = None + + # Stream/device functors are inherited from BaseKernelAdapter + def __init__(self, + params: list[KernelParam], + result_idx: list[int], + target: str | Target, + func_or_mod: tir.PrimFunc | tvm.IRModule, + host_mod: tvm.IRModule | None = None, + device_mod: tvm.IRModule | None = None, + rt_mod: tvm.runtime.Module | None = None, + host_kernel_source: str | None = None, + device_kernel_source: str | None = None, + verbose: bool = False, + pass_configs: dict[str, Any] | None = None, + compile_flags: list[str] | None = None): + """Initialize the adapter with the given TIR function or module. 
+ + Args: + params: List of tensor types for inputs/outputs + result_idx: Indices of output tensors + target: Target platform (e.g., 'cuda') + func_or_mod: TIR function or module to be compiled + verbose: Enable verbose logging + """ + self.params = params + self.result_idx = self._legalize_result_idx(result_idx) + self.host_kernel_source = host_kernel_source + self.device_kernel_source = device_kernel_source + + if isinstance(func_or_mod, tir.PrimFunc): + self.ir_module = tvm.IRModule({func_or_mod.attrs["global_symbol"]: func_or_mod}) + else: + self.ir_module = func_or_mod + + self.target = Target.canon_target(determine_target(target)) + + self.host_mod = host_mod + self.device_mod = device_mod + self.rt_mod = rt_mod + self.verbose = verbose + self.pass_configs = pass_configs + self.compile_flags = compile_flags + self.dynamic_symbolic_map = self._process_dynamic_symbolic() + + self._post_init() + + def _process_dynamic_symbolic(self) -> dict[tir.Var, tuple[int, int]]: + """Extract information about dynamic shapes from the TIR function. + + Maps symbolic variables to their corresponding (id, buffer_index, dimension) + for runtime shape resolution. 
+ id represents shape or stride, 0 represents shape, 1 represents stride + """ + func = self.prim_func + params = func.params + buffer_map = func.buffer_map + dynamic_symbolic_map = {} + for i, param in enumerate(params): + if param in buffer_map: + buffer = buffer_map[param] + for j, shape in enumerate(buffer.shape): + if (isinstance(shape, tir.Var) and (shape not in dynamic_symbolic_map) and + (shape not in params)): + dynamic_symbolic_map[shape] = (0, i, j) + for i, param in enumerate(params): + if param in buffer_map: + buffer = buffer_map[param] + for j, stride in enumerate(buffer.strides): + if (isinstance(stride, tir.Var) and (stride not in dynamic_symbolic_map) and + (stride not in params)): + dynamic_symbolic_map[stride] = (1, i, j) + return dynamic_symbolic_map + + def _convert_torch_func(self) -> Callable[..., Any]: + # Capture thunks that reflect Torch's current stream and device. + # These are evaluated at call time to align TVM execution with the + # caller's active PyTorch stream/device. 
+ # current_stream_functor = self.get_current_stream_functor() + current_device_functor = self.get_current_device_functor() + + # Convert TVM types to native Python types during initialization + param_dtypes = [param.dtype for param in self.params] + # Convert TVM shape arrays to native Python lists + param_shapes = [] + + for param in self.params: + native_shape = [] + for dim in param.shape: + if isinstance(dim, tir.IntImm): + native_shape.append(int(dim)) + elif isinstance(dim, tir.Var): + native_shape.append(dim) # Keep tir.Var for dynamic dimensions + else: + native_shape.append(dim) + param_shapes.append(native_shape) + + if self.executable is None: + self.executable = runtime.Executable(self.rt_mod) + + dynamic_symbolic_map = self._process_dynamic_symbolic() + executable = self.executable + + # Prepare helpers for friendly dtype error messages + prim_func = self.prim_func + buffer_map = prim_func.buffer_map + params = prim_func.params + # Expected dtype string per parameter index (for buffers only) + expected_dtype_strs: list[str | None] = [] + # Track whether each param is a buffer (has dtype) vs scalar + is_buffer_param: list[bool] = [] + for p in params: + if p in buffer_map: + expected_dtype_strs.append(str(buffer_map[p].dtype)) + is_buffer_param.append(True) + else: + expected_dtype_strs.append(None) + is_buffer_param.append(False) + # Global function name used in error messages + global_symbol = str(prim_func.attrs.get("global_symbol", "main")) + + # Map torch dtype to TVM-style dtype string + def torch_dtype_to_tvm_str(dtype: torch.dtype) -> str: + try: + import torch as _torch + except Exception: # pragma: no cover + # Fallback, though torch should always be available here + return str(dtype) + fp8_e4m3fn = getattr(_torch, "float8_e4m3fn", None) + fp8_e4m3fnuz = getattr(_torch, "float8_e4m3fnuz", None) + fp8_e5m2 = getattr(_torch, "float8_e5m2", None) + fp8_e5m2fnuz = getattr(_torch, "float8_e5m2fnuz", None) + if fp8_e4m3fn is not None and dtype == 
fp8_e4m3fn: + return "float8_e4m3" + if fp8_e4m3fnuz is not None and dtype == fp8_e4m3fnuz: + return "float8_e4m3fnuz" + if fp8_e5m2 is not None and dtype == fp8_e5m2: + return "float8_e5m2" + if fp8_e5m2fnuz is not None and dtype == fp8_e5m2fnuz: + return "float8_e5m2" + # Strip torch. prefix for readability + s = str(dtype) + return s[6:] if s.startswith("torch.") else s + + def func(*inputs: torch.Tensor | Any): + # Validate input count strictly + expected_inputs = len(self.params) - len(self.result_idx) + if len(inputs) != expected_inputs: + raise ValueError( + f"Expected {expected_inputs} inputs, got {len(inputs)} (params={len(self.params)}, outputs={len(self.result_idx)})" + ) + + # Resolve the device used for outputs. Prefer the first tensor input's device + # if available, otherwise use PyTorch's current device. + out_device: torch.device | None = None + + # Stitch the full positional argument list expected by the TVM executable + ins_idx: int = 0 + tensor_list: list[torch.Tensor] = [] + + # Prepare input and output tensors + for i in range(len(self.params)): + if i in self.result_idx: + dtype = param_dtypes[i] + shape = [] + # Now working with native Python list, no FFI calls needed + for s in param_shapes[i]: + if isinstance(s, tir.Var): + for key in dynamic_symbolic_map: + if (str(s) == str(key)): + ref_id, ref_tensor_idx, ref_shape_idx = dynamic_symbolic_map[ + key] + shape.append(tensor_list[ref_tensor_idx].shape[ref_shape_idx]) + else: # Already converted to Python int during initialization + shape.append(s) + + if out_device is None: + out_device = current_device_functor() + + if len(shape) == 0: + param_name = self.params[i].name if hasattr(self.params[i], + 'name') else f'parameter_{i}' + raise ValueError( + f"Cannot create output tensor (name={param_name}) - 0-dimensional tensors are not supported. 
" + f"Expected shape: {shape}") + tensor = torch.empty(*shape, dtype=dtype, device=out_device) + else: + tensor = inputs[ins_idx] + # Input dtype validation with clear error message + if is_buffer_param[i]: + expected_dtype_str = expected_dtype_strs[i] + expected_torch_dtype = param_dtypes[i] + # Only check when the argument is a tensor and expected dtype is known + if isinstance( + tensor, torch.Tensor + ) and expected_dtype_str is not None and tensor.dtype != expected_torch_dtype: + param_var = params[i] + # Reconstruct TVM-like handle name A_handle for error clarity + handle_name = f"{param_var.name}_handle" + actual_dtype_str = torch_dtype_to_tvm_str(tensor.dtype) + raise RuntimeError( + f"{global_symbol}.{handle_name}.dtype is expected to be {expected_dtype_str}, but got {actual_dtype_str}" + ) + ins_idx += 1 + tensor_list.append(tensor) + + executable(*tensor_list) + + # Return outputs in the requested form + if len(self.result_idx) == 1: + return tensor_list[self.result_idx[0]] + return [tensor_list[i] for i in self.result_idx] + + return func + + @classmethod + def from_database(cls, + params: list[TensorType], + result_idx: list[int], + target: str, + func_or_mod: tir.PrimFunc | tvm.IRModule, + host_kernel_source: str, + device_kernel_source: str, + kernel_lib_path: str, + verbose: bool = False, + pass_configs: dict[str, Any] | None = None, + compile_flags: list[str] | None = None): + adapter = cls.__new__(cls) + adapter.params = params + adapter.result_idx = adapter._legalize_result_idx(result_idx) + adapter.host_kernel_source = host_kernel_source + adapter.device_kernel_source = device_kernel_source + adapter.wrapped_source = device_kernel_source + "\n\n" + host_kernel_source + adapter.pass_configs = pass_configs + + if isinstance(func_or_mod, tir.PrimFunc): + adapter.ir_module = tvm.IRModule({func_or_mod.attrs["global_symbol"]: func_or_mod}) + else: + adapter.ir_module = func_or_mod + + target = determine_target(target, return_object=True) + 
adapter.target = Target.canon_target(determine_target(target)) + + adapter.verbose = verbose + adapter.executable = runtime.load_module(kernel_lib_path) + adapter._post_init() + return adapter + + def get_host_source(self): + """Returns the source code of the host module.""" + if self.host_kernel_source is not None: + return self.host_kernel_source + return self.rt_mod.inspect_source() + + def get_device_source(self): + """Returns the source code of the device module.""" + if self.device_kernel_source is not None: + return self.device_kernel_source + return self.rt_mod.imports[0].inspect_source() + + def get_kernel_source(self, kernel_only: bool = False): + """Returns the source code of the compiled kernel.""" + if kernel_only: + return self.get_device_source() + else: + return self.get_device_source() + "\n\n" + self.get_host_source() + + @property + def prim_func(self) -> tir.PrimFunc: + """Returns the primary TIR function from the IR module.""" + return retrieve_func_from_module(self.ir_module) diff --git a/tilelang/jit/execution_backend.py b/tilelang/jit/execution_backend.py new file mode 100644 index 00000000..fe600002 --- /dev/null +++ b/tilelang/jit/execution_backend.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +from collections.abc import Iterable + +from tvm.target import Target + +# Canonical names for execution backends used internally +_CANONICAL_MAP = { + "dlpack": "tvm_ffi", # historical alias +} + + +def _canon_backend(name: str | None) -> str | None: + if name is None: + return None + key = str(name).lower() + return _CANONICAL_MAP.get(key, key) + + +def _target_kind(target: Target) -> str: + # tvm.target.Target always has kind.name + return target.kind.name + + +def allowed_backends_for_target(target: Target, *, include_unavailable: bool = True) -> list[str]: + """Return allowed execution backends for a given TVM target kind. 
+ + include_unavailable: if False, this will filter out backends that are known + to be unavailable at runtime (e.g., NVRTC without cuda-python installed). + """ + kind = _target_kind(target) + + if kind == "cuda": + allowed = ["tvm_ffi", "nvrtc", "cython", "ctypes"] + elif kind == "hip": + allowed = ["tvm_ffi", "cython", "ctypes"] + elif kind == "metal": + allowed = ["torch"] + elif kind == "c": # CPU C backend + allowed = ["cython", "ctypes", "tvm_ffi"] + else: + # Fallback: prefer portable hosts + allowed = ["cython", "ctypes", "tvm_ffi"] + + if not include_unavailable: + # Drop NVRTC if not importable + try: + from tilelang.jit.adapter.nvrtc import is_nvrtc_available # lazy + if not is_nvrtc_available and "nvrtc" in allowed: + allowed = [b for b in allowed if b != "nvrtc"] + except Exception: + # Be conservative and keep nvrtc if detection itself fails + pass + + return allowed + + +def _format_options(options: Iterable[str]) -> str: + return ", ".join(sorted(options)) + + +def resolve_execution_backend(requested: str | None, target: Target) -> str: + """Resolve an execution backend string to a concrete backend. + + - Supports the alias "dlpack" -> "tvm_ffi". + - Supports the sentinel "auto" which selects a sensible default per target. + - Validates the combination (target, backend) and raises with helpful + alternatives when invalid. 
+ """ + req = _canon_backend(requested) + allowed_all = allowed_backends_for_target(target, include_unavailable=True) + allowed_avail = allowed_backends_for_target(target, include_unavailable=False) + + # Default selection for auto/None + if req in (None, "auto"): + kind = _target_kind(target) + if kind == "cuda": + choice = "tvm_ffi" + elif kind == "metal": + choice = "torch" + else: + choice = "cython" + # If the chosen default is not available (very rare), fall back to first available + if choice not in allowed_avail and allowed_avail: + choice = allowed_avail[0] + return choice + + # Validate against allowed + if req not in allowed_all: + raise ValueError( + f"Invalid execution backend '{requested}' for target '{_target_kind(target)}'. " + f"Allowed: {_format_options(allowed_all)}. Tip: use execution_backend='auto'.") + + # Promote to availability-aware set for nicer errors (e.g., nvrtc not installed) + if req not in allowed_avail: + raise ValueError( + f"Execution backend '{requested}' requires extra dependencies and is not available now. 
" + f"Try one of: {_format_options(allowed_avail)}.") + + return req diff --git a/tilelang/jit/kernel.py b/tilelang/jit/kernel.py index 6f5eb0b5..22cecf99 100644 --- a/tilelang/jit/kernel.py +++ b/tilelang/jit/kernel.py @@ -15,7 +15,7 @@ from tilelang import tvm from tilelang import env from tilelang.engine.param import CompiledArtifact, KernelParam from tilelang.jit.adapter import (BaseKernelAdapter, CtypesKernelAdapter, CythonKernelAdapter, - TorchDLPackKernelAdapter, MetalKernelAdapter) + TVMFFIKernelAdapter, MetalKernelAdapter) from tilelang.profiler import Profiler, TensorSupplyType from tilelang.utils.target import determine_target from tilelang.contrib import nvcc as tl_nvcc @@ -55,7 +55,7 @@ class JITKernel(Generic[_P, _T]): self, func: PrimFunc = None, out_idx: list[int] | int = None, - execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"] = "cython", + execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] = "tvm_ffi", target: str | Target = "auto", target_host: str | Target = None, verbose: bool = False, @@ -72,8 +72,8 @@ class JITKernel(Generic[_P, _T]): The TileLang TIR function to compile and wrap. out_idx : Union[List[int], int], optional Index(es) of the output tensors to return (default: None). - execution_backend : Literal["dlpack", "ctypes", "cython", "nvrtc"], optional - Execution backend to use for kernel execution (default: "cython"). + execution_backend : Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch"], optional + Execution backend to use for kernel execution. target : Union[str, Target], optional Compilation target, either as a string or a TVM Target object (default: "auto"). target_host : Union[str, Target], optional @@ -102,7 +102,7 @@ class JITKernel(Generic[_P, _T]): # Validate the execution backend. 
assert execution_backend in [ - "dlpack", + "tvm_ffi", "ctypes", "cython", "nvrtc", @@ -143,13 +143,14 @@ class JITKernel(Generic[_P, _T]): def from_database( cls, func: PrimFunc, - kernel_global_source: str, + host_kernel_source: str, + device_kernel_source: str, kernel_lib_path: str, params: list[KernelParam], target: str | Target, target_host: str | Target, out_idx: list[int] | int, - execution_backend: Literal["dlpack", "ctypes", "cython", "nvrtc"], + execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch"], pass_configs: dict[str, Any] | None = None, compile_flags: list[str] | None = None, ): @@ -172,7 +173,8 @@ class JITKernel(Generic[_P, _T]): params=params, result_idx=out_idx, target=target, - kernel_global_source=kernel_global_source, + host_kernel_source=host_kernel_source, + device_kernel_source=device_kernel_source, kernel_lib_path=kernel_lib_path, pass_configs=pass_configs, compile_flags=compile_flags, @@ -223,8 +225,8 @@ class JITKernel(Generic[_P, _T]): compile_flags = self.compile_flags # Compile the function with TVM, optimizing with shared memory lowering. - enable_host_codegen = execution_backend == "dlpack" - enable_device_compile = execution_backend == "dlpack" + enable_host_codegen = execution_backend == "tvm_ffi" + enable_device_compile = execution_backend == "tvm_ffi" with tvm.transform.PassContext(opt_level=3, config=pass_configs), self.target: artifact = tilelang.lower( tilelang_func, @@ -236,13 +238,23 @@ class JITKernel(Generic[_P, _T]): self.artifact = artifact # Create an adapter based on the specified execution backend. - if execution_backend == "dlpack": - # Use TorchDLPackKernelAdapter for interoperability with PyTorch via DLPack. + if execution_backend == "tvm_ffi": + # Use TVMFFIKernelAdapter for interoperability with PyTorch via DLPack. # But we need to ensure that the runtime is enabled and the runtime module is not None. - assert tvm.runtime.enabled("llvm"), "DLPack backend requires LLVM runtime." 
- assert (artifact.rt_mod is not None), "DLPack backend requires a runtime module." - adapter = TorchDLPackKernelAdapter( - artifact.rt_mod, params=artifact.params, result_idx=out_idx) + assert (artifact.rt_mod is not None), "tvm_ffi backend requires a runtime module." + adapter = TVMFFIKernelAdapter( + params=artifact.params, + result_idx=out_idx, + target=target, + func_or_mod=tilelang_func, + host_mod=artifact.host_mod, + device_mod=artifact.device_mod, + rt_mod=artifact.rt_mod, + device_kernel_source=artifact.kernel_source, + verbose=verbose, + pass_configs=pass_configs, + compile_flags=compile_flags, + ) elif execution_backend == "ctypes": adapter = CtypesKernelAdapter( params=artifact.params, @@ -251,7 +263,7 @@ class JITKernel(Generic[_P, _T]): func_or_mod=tilelang_func, host_mod=artifact.host_mod, device_mod=artifact.device_mod, - kernel_global_source=artifact.kernel_source, + device_kernel_source=artifact.kernel_source, verbose=verbose, pass_configs=pass_configs, compile_flags=compile_flags, @@ -264,7 +276,7 @@ class JITKernel(Generic[_P, _T]): func_or_mod=tilelang_func, host_mod=artifact.host_mod, device_mod=artifact.device_mod, - kernel_global_source=artifact.kernel_source, + device_kernel_source=artifact.kernel_source, verbose=verbose, pass_configs=pass_configs, compile_flags=compile_flags, @@ -278,7 +290,7 @@ class JITKernel(Generic[_P, _T]): func_or_mod=tilelang_func, host_mod=artifact.host_mod, device_mod=artifact.device_mod, - kernel_global_source=artifact.kernel_source, + device_kernel_source=artifact.kernel_source, verbose=verbose, pass_configs=pass_configs, compile_flags=compile_flags, @@ -308,7 +320,8 @@ class JITKernel(Generic[_P, _T]): result_idx: list[int] | int, target: str | Target, func_or_mod: PrimFunc | tvm.runtime.Module, - kernel_global_source: str, + host_kernel_source: str, + device_kernel_source: str, kernel_lib_path: str, pass_configs: dict[str, Any] | None = None, compile_flags: list[str] | None = None) -> BaseKernelAdapter: @@ 
-316,15 +329,26 @@ class JITKernel(Generic[_P, _T]): execution_backend = self.execution_backend # Create an adapter based on the specified execution backend. - if execution_backend == "dlpack": - raise ValueError("DLPack backend is not supported for TileLang JIT.") + if execution_backend == "tvm_ffi": + adapter = TVMFFIKernelAdapter.from_database( + params=params, + result_idx=result_idx, + target=target, + func_or_mod=func_or_mod, + host_kernel_source=host_kernel_source, + device_kernel_source=device_kernel_source, + kernel_lib_path=kernel_lib_path, + pass_configs=pass_configs, + compile_flags=compile_flags, + ) elif execution_backend == "ctypes": adapter = CtypesKernelAdapter.from_database( params=params, result_idx=result_idx, target=target, func_or_mod=func_or_mod, - kernel_global_source=kernel_global_source, + host_kernel_source=host_kernel_source, + device_kernel_source=device_kernel_source, kernel_lib_path=kernel_lib_path, pass_configs=pass_configs, compile_flags=compile_flags, @@ -335,7 +359,8 @@ class JITKernel(Generic[_P, _T]): result_idx=result_idx, target=target, func_or_mod=func_or_mod, - kernel_global_source=kernel_global_source, + host_kernel_source=host_kernel_source, + device_kernel_source=device_kernel_source, kernel_lib_path=kernel_lib_path, pass_configs=pass_configs, ) @@ -346,7 +371,8 @@ class JITKernel(Generic[_P, _T]): result_idx=result_idx, target=target, func_or_mod=func_or_mod, - kernel_global_source=kernel_global_source, + host_kernel_source=host_kernel_source, + device_kernel_source=device_kernel_source, kernel_lib_path=kernel_lib_path, pass_configs=pass_configs, compile_flags=compile_flags, @@ -394,7 +420,7 @@ class JITKernel(Generic[_P, _T]): return Profiler(self.params, self.out_idx, tensor_supply_type).with_default_adapter(self.adapter) - def get_kernel_source(self) -> str: + def get_kernel_source(self, kernel_only: bool = True) -> str: """ Returns the source code of the compiled kernel function. 
@@ -403,14 +429,17 @@ class JITKernel(Generic[_P, _T]): str The source code of the compiled kernel function. """ - if self.execution_backend in {"ctypes", "cython", "nvrtc"}: - return self.adapter.get_kernel_source() + if self.execution_backend in {"ctypes", "cython", "nvrtc", "tvm_ffi"}: + return self.adapter.get_kernel_source(kernel_only=kernel_only) return self.artifact.kernel_source def get_host_source(self) -> str: """ Returns the source code of the host function. """ + if self.execution_backend in {"ctypes", "cython", "nvrtc", "tvm_ffi"}: + return self.adapter.get_host_source() + assert self.artifact.host_mod is not None, "host_mod is not available" return str(self.artifact.host_mod) def run_once(self, func: Callable | None = None) -> None: diff --git a/tilelang/profiler/__init__.py b/tilelang/profiler/__init__.py index 3ff2baab..5af1fc2b 100644 --- a/tilelang/profiler/__init__.py +++ b/tilelang/profiler/__init__.py @@ -10,7 +10,6 @@ from tilelang.utils.tensor import ( get_tensor_supply, TensorSupplyType, torch_assert_close, - adapt_torch2tvm, ) from tilelang.engine.param import KernelParam from tilelang.jit.adapter import BaseKernelAdapter @@ -274,9 +273,8 @@ class Profiler: device = tvm.cuda(0) if target == "cuda" else tvm.rocm(0) time_evaluator = self.mod.time_evaluator( self.mod.entry_name, device, number=rep, repeat=n_repeat) - tvm_inputs = [adapt_torch2tvm(inp) for inp in ins] # Transform Latency to ms - return time_evaluator(*tvm_inputs).mean * 1e3 + return time_evaluator(*ins).mean * 1e3 else: raise ValueError(f"Unknown profiler: {profiler}") diff --git a/tilelang/utils/tensor.py b/tilelang/utils/tensor.py index 79947750..b275708c 100644 --- a/tilelang/utils/tensor.py +++ b/tilelang/utils/tensor.py @@ -1,9 +1,7 @@ """The profiler and convert to torch utils""" from enum import Enum import torch -from tvm import runtime from tvm import tir -from torch.utils.dlpack import to_dlpack import numpy as np @@ -37,23 +35,6 @@ def map_torch_type(intype: str) -> 
torch.dtype: return getattr(torch, intype) -def adapt_torch2tvm(arg): - float8_dtype_map = { - torch.float8_e4m3fn: "float8_e4m3", - torch.float8_e4m3fnuz: "float8_e4m3", - torch.float8_e5m2: "float8_e5m2", - torch.float8_e5m2fnuz: "float8_e5m2", - } - if isinstance(arg, torch.Tensor): - if arg.dtype in { - torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz - }: - return runtime.from_dlpack(to_dlpack(arg.view(torch.int8)))._create_view( - shape=arg.shape, dtype=float8_dtype_map[arg.dtype]) - return runtime.from_dlpack(to_dlpack(arg)) - return arg - - def get_tensor_supply(supply_type: TensorSupplyType = TensorSupplyType.Integer): from tilelang.engine.param import KernelParam -- GitLab From 4c8b9adab435f3e6fa05a4da4aaaec4a8f66c2d9 Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:09:35 +0800 Subject: [PATCH 019/139] [Bugfix] Supply missing `T.print` for bool type (#1279) * fix for bool dtype * lint fix * fix * ci fix --- 3rdparty/tvm | 2 +- src/tl_templates/cuda/debug.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/3rdparty/tvm b/3rdparty/tvm index f4105f89..f4affc7f 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit f4105f89a646622acc9818584d1d91e2ca3f533d +Subproject commit f4affc7f31e36e7f88c0fe1c715b03215c6a0c62 diff --git a/src/tl_templates/cuda/debug.h b/src/tl_templates/cuda/debug.h index 7dbb31ea..e8976874 100644 --- a/src/tl_templates/cuda/debug.h +++ b/src/tl_templates/cuda/debug.h @@ -29,6 +29,14 @@ __device__ void debug_print_var(const char *msg, signed char var) { threadIdx.z, var); } +// Specialization for plain char type +template <> __device__ void debug_print_var(const char *msg, char var) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=char " + "value=%d\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, (int)var); +} + // 
Specialization for unsigned char type template <> __device__ void debug_print_var(const char *msg, @@ -58,6 +66,14 @@ __device__ void debug_print_var(const char *msg, threadIdx.z, var); } +// Specialization for bool type +template <> __device__ void debug_print_var(const char *msg, bool var) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=bool " + "value=%s\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, var ? "true" : "false"); +} + // Specialization for float type template <> __device__ void debug_print_var(const char *msg, float var) { printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=float " -- GitLab From cd681e6384c72fb8fd0375e21b58791e549ce8fc Mon Sep 17 00:00:00 2001 From: Kuris <227995639+kurisu6912@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:17:45 +0800 Subject: [PATCH 020/139] [Fix] Fix memory leak bug (#1281) * add typing stub for tir.ir * remove idents * minor update * [Refactor] add numpy conversion for dtype * fix lint error * remove unused np.float_ in dtype conversion * fix type in np.int_ * fix typo * minor fix * remove debug files * fix memory leak bug * fix lint error * add comments * fix lint error * remove duplicated, because tilelang doesn't dependent deprecated --- .../python/language/test_tilelang_capture.py | 40 ++++++++++++++++ tilelang/language/v2/ast.py | 39 ++++++++++++--- tilelang/language/v2/builder.py | 48 +++++++++++-------- tilelang/language/v2/utils.py | 20 -------- 4 files changed, 101 insertions(+), 46 deletions(-) create mode 100644 testing/python/language/test_tilelang_capture.py diff --git a/testing/python/language/test_tilelang_capture.py b/testing/python/language/test_tilelang_capture.py new file mode 100644 index 00000000..875fa681 --- /dev/null +++ b/testing/python/language/test_tilelang_capture.py @@ -0,0 +1,40 @@ +import tilelang.language as T +import tilelang.testing +import torch +import weakref +import gc + + +def 
test_tilelang_capture(): + + @tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + },) + def get_dummy_kernel(): + + @T.prim_func + def dummy_kernel(a: T.Tensor[(1,), T.float32],): + with T.Kernel(1) as _: + a[0] = 1 + + return dummy_kernel + + a = torch.randn(1, 1024) + a_weak = weakref.ref(a) + _kernel = get_dummy_kernel() + del a + torch.cuda.empty_cache() + gc.collect() + torch.cuda.empty_cache() + a_upgrade = a_weak() + assert a_upgrade is None, "A is not garbage collected" + + # use objgraph to debug + # if a_upgrade is not None: + # objgraph.show_backrefs([a_upgrade], max_depth=5) + + +if __name__ == '__main__': + tilelang.testing.main() diff --git a/tilelang/language/v2/ast.py b/tilelang/language/v2/ast.py index cf879ee5..307efdac 100644 --- a/tilelang/language/v2/ast.py +++ b/tilelang/language/v2/ast.py @@ -248,8 +248,9 @@ class BaseBuilder: class DSLMutator(ast.NodeTransformer): - def __init__(self): + def __init__(self, closure_names: list[str]): self.tmp_counter = 0 + self.closure_names = closure_names def get_tmp(self) -> str: name = f"__{self.tmp_counter}" @@ -494,9 +495,11 @@ class DSLMutator(ast.NodeTransformer): node.body = stmts + node.body node.decorator_list.clear() return quote1( - f"def {node.name}(__tb):\n" - " range = __tb.override('range')\n" - " pass\n" + f"def make_closure({', '.join(self.closure_names)}):\n" + f" def {node.name}(__tb):\n" + " range = __tb.override('range')\n" + " pass\n" + f" return {node.name}\n" f" return {node.name}", passes=[node], ) @@ -595,7 +598,29 @@ def mutate(func: Callable[_P, _T]) -> IRGenerator[_P, _T]: tree = utils.get_ast(func) filename = inspect.getsourcefile(func) or inspect.getfile(func) - tree = DSLMutator().visit(tree) - fn = utils.get_compiled_object(tree, func.__name__, filename, - utils.inspect_function_capture(func)) + nonlocals = utils.get_func_nonlocals(func) + + # DSLMutator generates a function named 
`make_closure` + # it accepts all names inside nonlocal, and returns the mutated function + # this is because we must separate the closure namespace form the global namespace + # if we directly inject closure variables into the global namespace, + # it generates a new `globals` dict, and the dict owns all reference to the original globalns + # which makes memory leak, because the original globalns cannot be freed + # ```py + # a = 123 + # def foo(): + # x = foo.__globals__ # OK, globals are maintained by python + # x = {**foo.__globals__, } # Not OK: globals are copied, and the original globals cannot be freed + # def bar(): x + # return bar + # ``` + tree = DSLMutator(nonlocals.keys()).visit(tree) + + make_closure = utils.get_compiled_object( + tree, + 'make_closure', + filename, + func.__globals__, # use the original globalns + ) + fn = make_closure(**nonlocals) return IRGenerator(gen=fn, source=ast.unparse(tree)) diff --git a/tilelang/language/v2/builder.py b/tilelang/language/v2/builder.py index 684880b7..6931c5af 100644 --- a/tilelang/language/v2/builder.py +++ b/tilelang/language/v2/builder.py @@ -18,6 +18,7 @@ try: except ImportError: # Python < 3.11 for Self, < 3.10 for ParamSpec from typing_extensions import ParamSpec, Self from . import dtypes as dt +from . import utils import threading import logging @@ -593,22 +594,27 @@ def get_type_hints(func): # Build eval namespaces from function globals plus captured closure variables # This lets annotations reference symbols like `n`, `h`, or dtype vars # defined in the outer scope of a nested function. 
- globalns = dict(getattr(func, '__globals__', {})) - localns = dict(globalns) - try: - freevars = getattr(func.__code__, 'co_freevars', ()) - cells = getattr(func, '__closure__', ()) or () - closure_bindings = { - name: cell.cell_contents for name, cell in zip(freevars, cells) if name not in localns - } - if closure_bindings: - localns.update(closure_bindings) - # Also update globals so ForwardRef eval sees them uniformly - globalns.update(closure_bindings) - except Exception: - # Be permissive: absence or access issues with closure shouldn't crash - pass - + globalns = func.__globals__ + # Here we add nonlocals into localns, to capture the parameters declared in the parent function + # ```py + # def foo(): + # n = 128 # n is nonlocal + # def bar( + # A: T.Tensor(n, T.float32) # we add nonlocal in its eval context + # ): + # for i in range(n): ... + # ``` + # + # This is incomplete and buggy + # the only bug scenario the function body doesn't use the the parameters + # but such define-no-use scenario is very rare in writing kernels + # + # ```py + # def foo(): + # n = 128 + # def bar(A: T.Tensor((n,), T.float32)): + # ... 
# empty function, do not use `n` + localns = utils.get_func_nonlocals(func) for name, value in annot.items(): if name == 'return': continue @@ -618,8 +624,10 @@ def get_type_hints(func): if value is None: value = type(None) if isinstance(value, str): - # Handle simple dtype aliases like T.float32 appearing as strings - # Evaluate directly only when it matches known dtypes + # if the annotation is string, is can be: (i) a T.float32 like annotations, (ii) a ForwardRef object + # typing doesn't handle (i), it will try to interpret T.float32 + # typing see: T.float32 is str('float32'), and there is no object named `flaot32` and give a NameError + # here we manually interpret it to return T.float32 object try: _, v = value.split('.', maxsplit=1) except ValueError: @@ -631,7 +639,9 @@ def get_type_hints(func): except Exception: pass value = ForwardRef(value, is_argument=True, is_class=False) - hints[name] = _eval_type(value, globalns=globalns, localns=localns) + hints[name] = _eval_type(value, globalns=globalns, localns=localns) + else: + hints[name] = value return hints diff --git a/tilelang/language/v2/utils.py b/tilelang/language/v2/utils.py index 739ecd1e..84f06145 100644 --- a/tilelang/language/v2/utils.py +++ b/tilelang/language/v2/utils.py @@ -53,26 +53,6 @@ def get_func_nonlocals(func): return nonlocal_vars -def inspect_function_capture(func: Callable) -> dict[str, Any]: - """Capture function non-locals and global variables. - - Parameters - ---------- - func : Callable - The function to inspect. - - Returns - ------- - res : Dict[str, Any] - The function variables map with non-local or global variables. 
- """ - captured = { - **func.__globals__, # type: ignore - **get_func_nonlocals(func), - } - return captured - - def get_ast(func: Callable): _, start = inspect.getsourcelines(func) filename = inspect.getsourcefile(func) or inspect.getfile(func) -- GitLab From 551ac60d19369df615aef578faad2048a521ed99 Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Wed, 19 Nov 2025 16:27:44 +0800 Subject: [PATCH 021/139] [Enhancement] Enhance CUDA compilation by integrating pass context configuration (#1283) - Updated the `tilelang_callback_cuda_compile` function to accept a `pass_config` parameter, allowing for more flexible compilation options. - Introduced handling for fast math and PTXAS options based on the provided pass configuration. - Modified the CUDA build process in `rt_mod_cuda.cc` to utilize the current pass context, improving the integration of compilation settings. - Refactored NVCC command construction to use a dedicated function for better clarity and maintainability. 
--- src/target/rt_mod_cuda.cc | 6 +++++- tilelang/contrib/nvcc.py | 9 +-------- tilelang/engine/lower.py | 42 ++++++++++++++++++++++++++++----------- 3 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/target/rt_mod_cuda.cc b/src/target/rt_mod_cuda.cc index cbef0e64..a5e9b299 100644 --- a/src/target/rt_mod_cuda.cc +++ b/src/target/rt_mod_cuda.cc @@ -2,6 +2,7 @@ #include "runtime/cuda/cuda_module.h" #include "runtime/pack_args.h" #include +#include namespace tvm { namespace codegen { @@ -66,7 +67,10 @@ ffi::Module BuildTileLangCUDA(IRModule mod, Target target) { std::string ptx; if (const auto f = ffi::Function::GetGlobal("tilelang_callback_cuda_compile")) { - ptx = (*f)(code, target).cast(); + // Fetch current pass context config and pass into the compile callback + tvm::transform::PassContext pass_ctx = + tvm::transform::PassContext::Current(); + ptx = (*f)(code, target, pass_ctx->config).cast(); if (ptx[0] != '/') fmt = "cubin"; } else { diff --git a/tilelang/contrib/nvcc.py b/tilelang/contrib/nvcc.py index 202e0f3b..0d55cbf7 100644 --- a/tilelang/contrib/nvcc.py +++ b/tilelang/contrib/nvcc.py @@ -78,7 +78,7 @@ def compile_cuda(code, out_file.write(code) file_target = path_target if path_target else temp_target - cmd = ["nvcc"] + cmd = [get_nvcc_compiler()] cmd += [f"--{target_format}", "-O3"] if kernels_output_dir is not None: cmd += ["-lineinfo"] @@ -332,13 +332,6 @@ def get_cuda_version(cuda_path=None): raise RuntimeError("Cannot read cuda version file") -@tvm_ffi.register_global_func("tilelang_callback_cuda_compile", override=True) -def tilelang_callback_cuda_compile(code, target): # pylint: disable=unused-argument - """use nvcc to generate fatbin code for better optimization""" - ptx = compile_cuda(code, target_format="fatbin") - return ptx - - @tvm_ffi.register_global_func("tilelang_callback_libdevice_path", override=True) def find_libdevice_path(arch): """Utility function to find libdevice diff --git a/tilelang/engine/lower.py 
b/tilelang/engine/lower.py index c2a14552..63391f77 100644 --- a/tilelang/engine/lower.py +++ b/tilelang/engine/lower.py @@ -11,6 +11,8 @@ import tvm_ffi from tvm.ir import CallingConv from tvm.target import Target from tilelang.contrib import hipcc, nvcc +from tilelang.transform import PassConfigKey +from tilelang.utils.deprecated import deprecated_warning from tilelang.engine.param import KernelParam, CompiledArtifact from tilelang.utils.target import determine_target from tilelang.engine.phase import ( @@ -54,7 +56,7 @@ def get_host_call(is_device_c: bool = False) -> Callable[[tir.PrimFunc], bool]: @tvm_ffi.register_global_func("tilelang_callback_cuda_compile", override=True) -def tilelang_callback_cuda_compile(code, target): +def tilelang_callback_cuda_compile(code, target, pass_config=None): project_root = osp.join(osp.dirname(__file__), "../..") if "TL_TEMPLATE_PATH" in os.environ: tl_template_path = os.environ["TL_TEMPLATE_PATH"] @@ -69,21 +71,37 @@ def tilelang_callback_cuda_compile(code, target): target_arch = nvcc.get_target_arch(nvcc.get_target_compute_version(target)) arch = [f"-arch=sm_{target_arch}"] - format = "cubin" + compile_format = "cubin" + + # Read pass-config keys (string-valued) like in jit.adapter.libgen.compile_lib + cfg = pass_config or {} + if cfg.get(PassConfigKey.TL_DISABLE_FAST_MATH.value, False): + deprecated_warning("TL_DISABLE_FAST_MATH", "TL_ENABLE_FAST_MATH", "0.1.7") + disable_fast_math = bool(cfg.get(PassConfigKey.TL_DISABLE_FAST_MATH.value, True)) + enable_fast_math = not disable_fast_math + else: + enable_fast_math = bool(cfg.get(PassConfigKey.TL_ENABLE_FAST_MATH.value, False)) + + ptxas_usage_level = cfg.get(PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL.value, None) + verbose_ptxas_output = bool(cfg.get(PassConfigKey.TL_ENABLE_PTXAS_VERBOSE_OUTPUT.value, False)) + + options = [ + "-std=c++17", + "-I" + tl_template_path, + "-I" + cutlass_path, + ] + if enable_fast_math: + options.append("--use_fast_math") + if ptxas_usage_level 
is not None: + options.append(f"--ptxas-options=--register-usage-level={ptxas_usage_level}") + if verbose_ptxas_output: + options.append("--ptxas-options=--verbose") - # printing out number of registers - debug_option = "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage" ptx = nvcc.compile_cuda( code, - format, + compile_format, arch, - options=[ - "-std=c++17", - debug_option, - "--use_fast_math", - "-I" + tl_template_path, - "-I" + cutlass_path, - ], + options=options, verbose=False, ) -- GitLab From 49f353935cb5006b92f6dfd96bf7f64c80c0bdd0 Mon Sep 17 00:00:00 2001 From: liu yuhao Date: Wed, 19 Nov 2025 17:21:39 +0800 Subject: [PATCH 022/139] Fix the bug in issue #1266 (#1284) Co-authored-by: cheeryBloosm --- examples/deepseek_nsa/example_tilelang_nsa_fwd.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/deepseek_nsa/example_tilelang_nsa_fwd.py b/examples/deepseek_nsa/example_tilelang_nsa_fwd.py index f8a7ebfb..0b71779b 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_fwd.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_fwd.py @@ -156,13 +156,14 @@ def main(): DO = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device='cuda') block_indices = torch.full((B, SEQ_LEN, H, S), SEQ_LEN, dtype=torch.long, device='cuda') + block_counts = torch.zeros((B, SEQ_LEN, H), dtype=torch.long, device='cuda') for b in range(B): for t in range(SEQ_LEN): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] block_indices[b, t, h, :len(i_i)] = i_i + block_counts[b, t, h] = (block_indices[b, t, h] != SEQ_LEN).sum().item() block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, SEQ_LEN, H), device='cuda') out = kernel(Q, K, V, block_indices.to(torch.int32)) -- GitLab From 9e67b861c94be93d66badd06b19fbc5e415e56dd Mon Sep 17 00:00:00 2001 From: Chaofan Lin Date: Thu, 20 Nov 2025 01:30:20 +0800 Subject: [PATCH 023/139] [Language][UX] Nested loop checker in pre-lowering 
stage (#1288) * [Language][UX] Nested loop checker in pre-lowering stage * rename * comment * address comments --- src/transform/loop_partition.cc | 3 +- .../test_tilelang_language_nested_loop.py | 554 ++++++++++++++++++ tilelang/__init__.py | 1 + tilelang/analysis/__init__.py | 3 + tilelang/analysis/nested_loop_checker.py | 110 ++++ tilelang/engine/lower.py | 4 + tilelang/engine/phase.py | 11 + 7 files changed, 685 insertions(+), 1 deletion(-) create mode 100644 testing/python/language/test_tilelang_language_nested_loop.py create mode 100644 tilelang/analysis/__init__.py create mode 100644 tilelang/analysis/nested_loop_checker.py diff --git a/src/transform/loop_partition.cc b/src/transform/loop_partition.cc index fe1fe036..b4236c6d 100644 --- a/src/transform/loop_partition.cc +++ b/src/transform/loop_partition.cc @@ -93,7 +93,8 @@ For PartitionLoop(For op, Var thread_var, arith::Analyzer *analyzer, } for (int i = 0; i < old_loop_depth; i++) { const ForNode *loop = body.as(); - ICHECK(loop != nullptr); + ICHECK(loop != nullptr) + << "No extra statements are allowed between nested parallel loops."; vmap.Set(loop->loop_var, indices[i]); loop_mins.push_back(loop->min); loop_extents.push_back(loop->extent); diff --git a/testing/python/language/test_tilelang_language_nested_loop.py b/testing/python/language/test_tilelang_language_nested_loop.py new file mode 100644 index 00000000..b572a707 --- /dev/null +++ b/testing/python/language/test_tilelang_language_nested_loop.py @@ -0,0 +1,554 @@ +import tilelang +import tilelang.language as T +import torch +import tilelang.testing +import pytest + +tilelang.testing.set_random_seed() + + +def _require_cuda_tensor(shape, dtype=torch.float32): + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + try: + return torch.randn(*shape, device="cuda", dtype=dtype) + except RuntimeError as err: + pytest.skip(f"CUDA runtime unavailable: {err}") + + +""" +Nested Parallel cases: + +T.Parallel + T.Parallel + +Rule: + - 
continuous parallels are allowed and will be merged into one T.Parallel.
T.Pipeline + +is OK. +""" + + +def matmul_nested_pipelines(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, + out_dtype, accum_dtype, threads, order, stage, extra_pipeline_repeats): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + import tilelang.language as T + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + for _ in T.Pipelined(extra_pipeline_repeats): + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), order=order, stage=stage): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm_nested_pipelines( + order, + stage, + extra_pipeline_repeats, +): + M = 1024 + N = 1024 + K = 1024 + block_M = 128 + block_N = 128 + block_K = 32 + trans_A = False + trans_B = False + in_dtype = "float16" + out_dtype = "float16" + dtypeAccum = "float32" + num_threads = 128 + program = matmul_nested_pipelines( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_threads, + order, + stage, + extra_pipeline_repeats, + ) + + kernel = tilelang.compile( + program, + out_idx=[2], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + 
tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }) + profiler = kernel.get_profiler() + + def ref_program(A, B): + import torch + + if trans_A: + A = A.T + if trans_B: + B = B.T + if in_dtype == "float32": + # Convert float32 to tfloat32 because tfloat32 mma cannot truncate + # float32 automatically, -0x1000 meas + A = ((A.view(torch.int32) - 0x1000)).view(torch.float32) + B = ((B.view(torch.int32) - 0x1000)).view(torch.float32) + C = torch.matmul(A.to(torch.float), B.to(torch.float)) + C = C.to(torch.__getattribute__(out_dtype)) + return C + + profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) + + +def test_nested_pipelines(): + run_gemm_nested_pipelines(order=[0, 1, 2], stage=[0, 0, 1], extra_pipeline_repeats=3) + + +""" +Nested serial cases: + +T.serial + T.serial + +is OK. +""" + + +@tilelang.jit(out_idx=[1]) +def nested_continuous_serials(length=256, block=16, dtype="float32"): + + @T.prim_func + def main( + A: T.Tensor((length,), dtype), + B: T.Tensor((length,), dtype), + ): + with T.Kernel(1, threads=length) as _: + for i in T.serial(length // block): + for j in T.serial(block): + B[i * block + j] = A[i * block + j] + 1.0 + + return main + + +@tilelang.jit(out_idx=[1]) +def nested_noncontinuous_serials(length=256, block=16, dtype="float32"): + + @T.prim_func + def main( + A: T.Tensor((length,), dtype), + B: T.Tensor((length,), dtype), + ): + with T.Kernel(1, threads=length) as _: + for i in T.serial(length // block): + B[i] = 0 + for j in T.serial(block): + B[i * block + j] = A[i * block + j] + 1.0 + + return main + + +def test_nested_serials(): + kernel1 = nested_continuous_serials(length=256, block=16) + data = _require_cuda_tensor((256,), torch.float32) + result1 = kernel1(data) + torch.testing.assert_close(result1, data + 1.0, atol=1e-5, rtol=1e-5) + + # This is valid + nested_noncontinuous_serials(length=256, block=16) + + +""" +Mixed serial and Parallel loops: + +(S-P) +T.serial + T.Parallel + +(P-S) +T.Parallel + T.serial + +Rule: 
+ - No Parallel - * - Parallel +""" + + +@tilelang.jit(out_idx=[1]) +def nested_continuous_sp(length=256, block=16, dtype="float32"): + + @T.prim_func + def main( + A: T.Tensor((length,), dtype), + B: T.Tensor((length,), dtype), + ): + with T.Kernel(1, threads=length) as _: + for i in T.serial(length // block): + for j in T.Parallel(block): + B[i * block + j] = A[i * block + j] + 1.0 + + return main + + +@tilelang.jit(out_idx=[1]) +def nested_continuous_ps(length=256, block=16, dtype="float32"): + + @T.prim_func + def main( + A: T.Tensor((length,), dtype), + B: T.Tensor((length,), dtype), + ): + with T.Kernel(1, threads=length) as _: + for i in T.Parallel(length // block): + for j in T.serial(block): + B[i * block + j] = A[i * block + j] + 1.0 + + return main + + +@tilelang.jit(out_idx=[1]) +def nested_continuous_psp(length=256, block1=8, block2=2, dtype="float32"): + + @T.prim_func + def main( + A: T.Tensor((length,), dtype), + B: T.Tensor((length,), dtype), + ): + with T.Kernel(1, threads=length) as _: + for i in T.Parallel(length // block1 // block2): + for j in T.serial(block1): + for k in T.Parallel(block2): + B[i * block1 * block2 + j * block2 + + k] = A[i * block1 * block2 + j * block2 + k] + 1.0 + + return main + + +@tilelang.jit(out_idx=[1]) +def nested_continuous_sps(length=256, block1=8, block2=2, dtype="float32"): + + @T.prim_func + def main( + A: T.Tensor((length,), dtype), + B: T.Tensor((length,), dtype), + ): + with T.Kernel(1, threads=length) as _: + for i in T.serial(length // block1 // block2): + for j in T.Parallel(block1): + for k in T.serial(block2): + B[i * block1 * block2 + j * block2 + + k] = A[i * block1 * block2 + j * block2 + k] + 1.0 + + return main + + +def test_mixed_sp(): + kernel1 = nested_continuous_sp(length=256, block=16) + kernel2 = nested_continuous_ps(length=256, block=16) + data = _require_cuda_tensor((256,), torch.float32) + result1 = kernel1(data) + result2 = kernel2(data) + torch.testing.assert_close(result1, data + 1.0, 
atol=1e-5, rtol=1e-5) + torch.testing.assert_close(result2, data + 1.0, atol=1e-5, rtol=1e-5) + + # This should be invalid (Undefined behaviour) + with pytest.raises(ValueError): + nested_continuous_psp(length=256, block1=16, block2=8) + + kernel3 = nested_continuous_sps(length=256, block1=8, block2=2) + result3 = kernel3(data) + torch.testing.assert_close(result3, data + 1.0, atol=1e-5, rtol=1e-5) + + +""" +Mixed Pipelined and Parallel loops: + +(Pi-Pa) +T.Pipelined + T.Parallel + +(Pa-Pi) +T.Parallel + T.Pipelined + +Rule: + - Pi-Pa is ok where Pa-Pi is not allowed. + - For more nested cases, refer to the rule of T.Parallel. +""" + + +def matmul_nested_pipa( + M, + N, + K, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + threads, + order, + stage, +): + A_shape = (M, K) + B_shape = (K, N) + A_shared_shape = (block_M, block_K) + B_shared_shape = (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), order=order, stage=stage): + for i, j in T.Parallel(block_M, block_K): + A_shared[i, j] = A[by * block_M + i, k * block_K + j] + for i, j in T.Parallel(block_K, block_N): + B_shared[i, j] = B[k * block_K + i, bx * block_N + j] + + # T.copy(A[by * block_M, k * block_K], A_shared) + # T.copy(B[k * block_K, bx * block_N], B_shared) + + T.gemm(A_shared, B_shared, C_local, False, False) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def matmul_nested_papipa( + M, + N, + K, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + threads, + order, + stage, +): + A_shape = (M, K) + B_shape = 
(K, N) + A_shared_shape = (block_M, block_K) + B_shared_shape = (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for _ in T.Parallel(1): + for k in T.Pipelined(T.ceildiv(K, block_K), order=order, stage=stage): + for i, j in T.Parallel(block_M, block_K): + A_shared[i, j] = A[by * block_M + i, k * block_K + j] + for i, j in T.Parallel(block_K, block_N): + B_shared[i, j] = B[k * block_K + i, bx * block_N + j] + + # T.copy(A[by * block_M, k * block_K], A_shared) + # T.copy(B[k * block_K, bx * block_N], B_shared) + + T.gemm(A_shared, B_shared, C_local, False, False) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm_mixed_pp( + order, + stage, +): + M = 1024 + N = 1024 + K = 1024 + block_M = 128 + block_N = 128 + block_K = 32 + in_dtype = "float16" + out_dtype = "float16" + dtypeAccum = "float32" + num_threads = 128 + + program = matmul_nested_pipa( + M, + N, + K, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + dtypeAccum, + num_threads, + order, + stage, + ) + + kernel = tilelang.compile( + program, + out_idx=[2], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }) + profiler = kernel.get_profiler() + + def ref_program(A, B): + import torch + + if in_dtype == "float32": + # Convert float32 to tfloat32 because tfloat32 mma cannot truncate + # float32 automatically, -0x1000 meas + A = ((A.view(torch.int32) - 0x1000)).view(torch.float32) + B = ((B.view(torch.int32) - 0x1000)).view(torch.float32) + C = torch.matmul(A.to(torch.float), B.to(torch.float)) + C = 
C.to(torch.__getattribute__(out_dtype)) + return C + + profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) + + program1 = matmul_nested_papipa( + M, + N, + K, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + dtypeAccum, + num_threads, + order, + stage, + ) + with pytest.raises(ValueError): + tilelang.compile( + program1, + out_idx=[2], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }) + + +def test_mixed_pp(): + run_gemm_mixed_pp(order=[0, 1, 2], stage=[0, 0, 1]) + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/tilelang/__init__.py b/tilelang/__init__.py index e4be0129..2eae5cdb 100644 --- a/tilelang/__init__.py +++ b/tilelang/__init__.py @@ -133,6 +133,7 @@ from .layout import ( Fragment, # noqa: F401 ) from . import ( + analysis, # noqa: F401 transform, # noqa: F401 language, # noqa: F401 engine, # noqa: F401 diff --git a/tilelang/analysis/__init__.py b/tilelang/analysis/__init__.py new file mode 100644 index 00000000..b72fc2ba --- /dev/null +++ b/tilelang/analysis/__init__.py @@ -0,0 +1,3 @@ +"""Tilelang IR analysis & visitors.""" + +from .nested_loop_checker import NestedLoopChecker # noqa: F401 diff --git a/tilelang/analysis/nested_loop_checker.py b/tilelang/analysis/nested_loop_checker.py new file mode 100644 index 00000000..4b9741c3 --- /dev/null +++ b/tilelang/analysis/nested_loop_checker.py @@ -0,0 +1,110 @@ +from tvm import tir +from tvm.tir import ( + For, + PrimFunc, + PyStmtExprVisitor, +) +from tvm.tir.transform import prim_func_pass + + +def is_pipelined_for(op: For) -> bool: + """Check if a for loop is pipelined.""" + + anno_keys = [ + "num_stages", "tl_pipeline_order", "tl_pipeline_stage", "tl_pipeline_sync", + "tl_pipeline_group" + ] + return any(key in op.annotations for key in anno_keys) + + +@tir.functor.visitor +class _NestedLoopCheckVisitor(PyStmtExprVisitor): + + def __init__(self) -> None: + super().__init__() + 
self.in_parallel_context = False + + def visit_for_(self, op: For) -> None: + if op.kind == tir.ForKind.PARALLEL: + child = op.body + + # Special case: continuous nested parallel loop is allowed. + if isinstance(child, tir.For) and child.kind == tir.ForKind.PARALLEL: + self.visit_stmt(child) + return + + # Otherwise + if self.in_parallel_context: + raise ValueError("Nested parallel loops are not allowed. " + "Please check your loop structure.") + self.in_parallel_context = True + self.visit_stmt(child) + self.in_parallel_context = False + return + elif is_pipelined_for(op): + if self.in_parallel_context: + raise ValueError("Pipelined loop cannot be nested inside a parallel loop. " + "Please check your loop structure.") + + self.visit_stmt(op.body) + + +def NestedLoopChecker(): + """ + User-friendly pass which identifies any invalid any nested-loop pattern. + + Nested loops is an annoying problem in tilelang or other polyhedral-style compilers. + It contains many corner cases and undefined behaviours. + + In tilelang, there are four loops: + T.serial + T.Parallel (T.vectorized) + T.Pipelined + T.Persistent + + T.Persistent is a new feature which we do not consider here. + + We define the following rules: + - (Rule 1) T.serial can be nested inside any other loop type without restriction. + - (Rule 2) Consecutive T.Parallel nested loops are not allowed. Including any TileOp (T.copy, etc.) which has + "parallel" behaviours is also forbidden. + + Examples: + for i in T.Parallel(M): + stmt + for j in T.Parallel(N): + ... + + for i in T.Parallel(M): + T.copy(A, B) # forbidden! + + **Only a special case is allowed: strict continuous Parallel loops.** Since we can fuse them into a single T.Parallel loop. + Example: + + for i in T.Parallel(M): + for j in T.Parallel(N): + ... # allowed + - (Rule 3) T.Pipelined inside a T.Parallel is forbidden. + + Examples: + for i in T.Parallel(M): + for j in T.Pipelined(K): # forbidden! + ... 
+ + for i in T.Pipelined(K): + for j in T.Parallel(N): # allowed, ok + ... + + In summary, the problem mainly lies in the "T.Parallel". We highly recommend to use + T.Parallel to implement a tiled operator inside a kernel (e.g. T.gemm level) instead of other usages. + This guideline can help you avoid most of the issues. + + Returns: + A prim_func_pass that applies the transformation + """ + + def pass_fn(func: PrimFunc, mod, ctx): + _NestedLoopCheckVisitor().visit_stmt(func.body) + return func + + return prim_func_pass(pass_fn, opt_level=0) diff --git a/tilelang/engine/lower.py b/tilelang/engine/lower.py index 63391f77..88d89dcc 100644 --- a/tilelang/engine/lower.py +++ b/tilelang/engine/lower.py @@ -16,6 +16,7 @@ from tilelang.utils.deprecated import deprecated_warning from tilelang.engine.param import KernelParam, CompiledArtifact from tilelang.utils.target import determine_target from tilelang.engine.phase import ( + PreLowerSemanticCheck, LowerAndLegalize, OptimizeForTarget, ) @@ -242,6 +243,9 @@ def lower( _is_host_call = get_host_call(is_device_c=is_cpu_device_backend(target)) _is_device_call = get_device_call(is_device_c=is_cpu_device_backend(target)) + # Before lowering, do semantic check + PreLowerSemanticCheck(mod) + # Phase 1: Lower and legalize the IR mod = LowerAndLegalize(mod, target) diff --git a/tilelang/engine/phase.py b/tilelang/engine/phase.py index a7cc99f8..35c16a43 100644 --- a/tilelang/engine/phase.py +++ b/tilelang/engine/phase.py @@ -67,6 +67,17 @@ def should_force_let_inline(pass_ctx: PassContext | None = None) -> bool: return bool(pass_ctx and pass_ctx.config.get(tilelang.PassConfigKey.TL_FORCE_LET_INLINE, False)) +def PreLowerSemanticCheck(mod: IRModule) -> None: + """ + Check whether the module is valid before lowering. If not, raise a user-friendly error + in Python side instead of letting the error dive into the complicated TVM/C++ stack. + Note: This is a validation-only pipeline of passes and does not modify or return the module. 
+ """ + + # Check if there are any invalid nested loops. + tilelang.analysis.NestedLoopChecker()(mod) + + def LowerAndLegalize(mod: IRModule, target: Target) -> IRModule: # Bind the target device information to the module """ -- GitLab From bef7e52e32bb3280a4ad82dcdc61da9f0fc39001 Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:05:40 +0800 Subject: [PATCH 024/139] [Compatibility] Support CUDA 11.3 (#1290) --- src/tl_templates/cuda/atomic.h | 41 ++++++++++++++++++++++++++++++-- src/tl_templates/cuda/debug.h | 9 +++++++ src/tl_templates/cuda/gemm_mma.h | 1 - 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/src/tl_templates/cuda/atomic.h b/src/tl_templates/cuda/atomic.h index a573886b..0bbc4171 100644 --- a/src/tl_templates/cuda/atomic.h +++ b/src/tl_templates/cuda/atomic.h @@ -12,7 +12,11 @@ using cutlass::bfloat16_t; using cutlass::half_t; #define TL_DEVICE __forceinline__ __device__ - +#define TL_NOT_IMPLEMENTED() \ + { \ + printf("%s not implemented\n", __PRETTY_FUNCTION__); \ + asm volatile("brkpt;\n"); \ + } template struct normalize_atomic_type { using type = T; }; @@ -63,8 +67,12 @@ TL_DEVICE void AtomicMax(T1 &ref, T2 val, } } } else { +#if CUDART_VERSION >= 11080 cuda::atomic_ref aref(*address); aref.fetch_max(cuda_cast(val), cuda::memory_order(memory_order)); +#else + TL_NOT_IMPLEMENTED(); +#endif } } @@ -89,9 +97,13 @@ TL_DEVICE T1 AtomicMaxRet(T1 &ref, T2 val, } return static_cast(*reinterpret_cast(&old_val_ushort)); } else { +#if CUDART_VERSION >= 11080 cuda::atomic_ref aref(*address); return static_cast( aref.fetch_max(cuda_cast(val), cuda::memory_order(memory_order))); +#else + TL_NOT_IMPLEMENTED(); +#endif } } @@ -117,8 +129,13 @@ TL_DEVICE void AtomicMin(T1 &ref, T2 val, } } } else { +#if CUDART_VERSION >= 11080 cuda::atomic_ref aref(*address); - aref.fetch_min(cuda_cast(val), cuda::memory_order(memory_order)); + return static_cast( + aref.fetch_min(cuda_cast(val), 
cuda::memory_order(memory_order))); +#else + TL_NOT_IMPLEMENTED(); +#endif } } @@ -143,9 +160,13 @@ TL_DEVICE T1 AtomicMinRet(T1 &ref, T2 val, } return static_cast(*reinterpret_cast(&old_val_ushort)); } else { +#if CUDART_VERSION >= 11080 cuda::atomic_ref aref(*address); return static_cast( aref.fetch_min(cuda_cast(val), cuda::memory_order(memory_order))); +#else + TL_NOT_IMPLEMENTED(); +#endif } } @@ -216,8 +237,12 @@ TL_DEVICE void AtomicAdd(T1 &ref, T2 val, } } } else { +#if CUDART_VERSION >= 11080 cuda::atomic_ref aref(*address); aref.fetch_add(cuda_cast(val), cuda::memory_order(memory_order)); +#else + TL_NOT_IMPLEMENTED(); +#endif } } @@ -290,9 +315,13 @@ TL_DEVICE T1 AtomicAddRet(T1 &ref, T2 val, } } } else { +#if CUDART_VERSION >= 11080 cuda::atomic_ref aref(*address); return static_cast( aref.fetch_add(cuda_cast(val), cuda::memory_order(memory_order))); +#else + TL_NOT_IMPLEMENTED(); +#endif } } @@ -618,13 +647,21 @@ AtomicAddx4Ret(float *ref, float *val, #endif template TL_DEVICE T AtomicLoad(T &ref, int memory_order) { +#if CUDART_VERSION >= 11080 cuda::atomic_ref aref(ref); return aref.load(cuda::memory_order(memory_order)); +#else + TL_NOT_IMPLEMENTED(); +#endif } template TL_DEVICE void AtomicStore(T1 &ref, T2 value, int memory_order) { using NT1 = typename normalize_atomic_type::type; +#if CUDART_VERSION >= 11080 cuda::atomic_ref aref(ref); aref.store(cuda_cast(value), cuda::memory_order(memory_order)); +#else + TL_NOT_IMPLEMENTED(); +#endif } diff --git a/src/tl_templates/cuda/debug.h b/src/tl_templates/cuda/debug.h index e8976874..2724a814 100644 --- a/src/tl_templates/cuda/debug.h +++ b/src/tl_templates/cuda/debug.h @@ -1,6 +1,9 @@ #pragma once +#if __CUDA_ARCH_LIST__ >= 890 #include "./cuda_fp8.h" +#endif + #include "common.h" #ifndef __CUDACC_RTC__ @@ -117,6 +120,7 @@ __device__ void debug_print_var(const char *msg, double var) { threadIdx.z, var); } +#if __CUDA_ARCH_LIST__ >= 890 // Specialization for fp8_e4_t type template <> __device__ void 
debug_print_var(const char *msg, fp8_e4_t var) { @@ -137,6 +141,8 @@ __device__ void debug_print_var(const char *msg, fp8_e5_t var) { threadIdx.z, (float)var); } +#endif + // Template declaration for device-side debug printing (buffer only) template __device__ void debug_print_buffer_value(const char *msg, const char *buf_name, @@ -242,6 +248,7 @@ __device__ void debug_print_buffer_value(const char *msg, } // Specialization for fp8_e4_t type +#if __CUDA_ARCH_LIST__ >= 890 template <> __device__ void debug_print_buffer_value(const char *msg, const char *buf_name, @@ -263,6 +270,8 @@ __device__ void debug_print_buffer_value(const char *msg, threadIdx.z, buf_name, index, (float)var); } +#endif + // Specialization for int16 type template <> __device__ void debug_print_buffer_value(const char *msg, diff --git a/src/tl_templates/cuda/gemm_mma.h b/src/tl_templates/cuda/gemm_mma.h index 71283173..25841a3b 100644 --- a/src/tl_templates/cuda/gemm_mma.h +++ b/src/tl_templates/cuda/gemm_mma.h @@ -8,7 +8,6 @@ #include #include "common.h" -#include "cuda_fp8.h" #include "intrin.h" namespace cute::tl_mma { -- GitLab From bccb6485e4003533bb0e21391dd09478e7074562 Mon Sep 17 00:00:00 2001 From: Kuris <227995639+kurisu6912@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:56:09 +0800 Subject: [PATCH 025/139] [Feat] Add support for using `T.Tensor(n * 2 + 1)` in function annotation (#1285) * [Feature] Add support for A: T.Tensor(n + 1) and A: T.Tensor(2*n) * issue fix * fix * fix * decreate nproc for debugging --------- Co-authored-by: Lei Wang --- .github/workflows/ci.yml | 2 +- .../test_tilelang_example_deepseek_v32.py | 1 + src/transform/arg_binder.cc | 76 ++++++++++++++++--- src/transform/arg_binder.h | 1 + .../python/jit/test_tilelang_jit_callback.py | 2 + .../python/jit/test_tilelang_jit_tvm_ffi.py | 62 --------------- .../language/test_tilelang_language_annot.py | 71 +++++++++++++++++ 7 files changed, 142 insertions(+), 73 deletions(-) create mode 100644 
testing/python/language/test_tilelang_language_annot.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f9fe3286..ee796602 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -352,7 +352,7 @@ jobs: uv run --no-project -m -- pytest --verbose --color=yes --durations=0 --showlocals --cache-clear ) - "${PYTEST[@]}" --maxfail=3 --numprocesses=4 \ + "${PYTEST[@]}" --maxfail=3 --numprocesses=1 \ ../examples # NVIDIA CUDA tests diff --git a/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py b/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py index e10141b5..2dd27048 100644 --- a/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py +++ b/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py @@ -1,4 +1,5 @@ # ruff: noqa +import tilelang import tilelang.testing import topk_selector diff --git a/src/transform/arg_binder.cc b/src/transform/arg_binder.cc index 6a0909b8..361cfe90 100644 --- a/src/transform/arg_binder.cc +++ b/src/transform/arg_binder.cc @@ -29,8 +29,14 @@ #include #include +#include #include "tir/transforms/ir_utils.h" +#include "tvm/arith/int_solver.h" +#include "tvm/ffi/cast.h" +#include "tvm/ffi/container/array.h" +#include "tvm/tir/stmt.h" +#include "tvm/tir/stmt_functor.h" namespace tvm { namespace tl { @@ -51,6 +57,26 @@ void BinderAddAssert(arith::Analyzer *ana, PrimExpr cond, } } +std::vector ArgBinder::getUndefVars(const std::vector &args) { + std::unordered_set visit; + std::vector res; + for (const auto &arg : args) { + PostOrderVisit(arg, [&](ObjectRef r) { + if (auto var = r.as()) { + if (!visit.count(var)) { + visit.insert(var); + } + auto it = def_map_->find(var); + if (it == def_map_->end()) { + // res.push_back(var); + res.push_back(ffi::GetRef(var)); + } + } + }); + } + return res; +} + bool ArgBinder::BindNullable(const PrimExpr &arg, const PrimExpr &value, const std::string &arg_name, bool with_lets, const PrimExpr &nullable_guard) { @@ -60,20 +86,23 @@ bool 
ArgBinder::BindNullable(const PrimExpr &arg, const PrimExpr &value, // is_null || basic return Or(nullable_guard, basic); }; - ICHECK_EQ(arg.dtype(), value.dtype()) << "arg " << arg << " value " << value; + auto BindVar = [&](const VarNode *v, PrimExpr value) { + auto v_arg = ffi::GetRef(v); + defs_.emplace_back(v_arg); + if (with_lets) { + (*def_map_)[v] = value; + init_nest_.emplace_back(LetStmt(v_arg, value, Evaluate(0))); + } else { + (*def_map_)[v] = value; + } + }; + // 1. simple binding var = value if (const VarNode *v = arg.as()) { auto it = def_map_->find(v); if (it == def_map_->end()) { + BindVar(v, value); // First time binding: identical behavior as Bind_ - Var v_arg = Downcast(arg); - defs_.emplace_back(v_arg); - if (with_lets) { - (*def_map_)[v] = arg; - init_nest_.emplace_back(LetStmt(v_arg, value, Evaluate(0))); - } else { - (*def_map_)[v] = value; - } return true; } else { // Second or later binding: add is_null short-circuit @@ -81,7 +110,34 @@ bool ArgBinder::BindNullable(const PrimExpr &arg, const PrimExpr &value, BinderAddAssert(&analyzer_, cond, arg_name, &asserts_); } } else { - // For non-Var expressions, also add is_null short-circuit + // 2. 
complex binding expr = value + // get undefined variables + auto undefs = ffi::Array(getUndefVars({arg})); + if (!undefs.empty()) { + // if value is not integer, such as float, we are unable to solve it + if (!value.dtype().is_int() && !value.dtype().is_uint()) { + LOG(FATAL) << "Unable to solve non-integer variables " << undefs + << " from equation `" << value << "`"; + } + arith::IntConstraints constraints(undefs, {}, {arg == value}); + auto sol = arith::SolveLinearEquations(constraints); + if (!sol->dst->variables.empty()) { + LOG(FATAL) << "TVM is unable to solve variables " << undefs + << " from equation " << constraints; + } + for (const auto &v : undefs) { + auto value_opt = sol->src_to_dst.Get(v); + ICHECK(value_opt->defined()) + << "Unable to solve variable `" << v << "` from expression `" + << (arg == value) << "`"; + auto value = ffi::GetRef(sol->src_to_dst.Get(v)->get()); + BindVar(v.as(), value); + } + } + // we must add the assert again + // because the solved expression may contain floordiv (e.g. 
3 * m == n + // ==> m = n // 3) we re-compute the constraint to verify the solution + // is correct PrimExpr cond = MakeGuarded(arg == value); BinderAddAssert(&analyzer_, cond, arg_name, &asserts_); } diff --git a/src/transform/arg_binder.h b/src/transform/arg_binder.h index cf9f8466..793ada11 100644 --- a/src/transform/arg_binder.h +++ b/src/transform/arg_binder.h @@ -159,6 +159,7 @@ public: const PrimExpr &nullable_guard); private: + std::vector getUndefVars(const std::vector &arg); // Internal bind function bool Bind_(const PrimExpr &arg, const PrimExpr &value, const std::string &arg_name, bool with_lets); diff --git a/testing/python/jit/test_tilelang_jit_callback.py b/testing/python/jit/test_tilelang_jit_callback.py index d5aa00a4..e987368d 100644 --- a/testing/python/jit/test_tilelang_jit_callback.py +++ b/testing/python/jit/test_tilelang_jit_callback.py @@ -91,7 +91,9 @@ def run_gemm( code = f"// {stramp}\n" + code return code + tilelang.disable_cache() matmul_kernel = tilelang.compile(program, out_idx=-1) + tilelang.enable_cache() kernel_source = matmul_kernel.get_kernel_source() diff --git a/testing/python/jit/test_tilelang_jit_tvm_ffi.py b/testing/python/jit/test_tilelang_jit_tvm_ffi.py index cd5d9c75..f7bde6af 100644 --- a/testing/python/jit/test_tilelang_jit_tvm_ffi.py +++ b/testing/python/jit/test_tilelang_jit_tvm_ffi.py @@ -52,68 +52,6 @@ def matmul( return main -def run_gemm( - M, - N, - K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - block_M, - block_N, - block_K, - num_stages=3, - num_threads=128, -): - program = matmul( - M, - N, - K, - block_M, - block_N, - block_K, - trans_A, - trans_B, - in_dtype, - out_dtype, - dtypeAccum, - num_stages, - num_threads, - ) - - stramp = "&*(XS)" - - @tvm.register_global_func("tilelang_callback_cuda_postproc", override=True) - def tilelang_callback_cuda_postproc(code, _): - code = f"// {stramp}\n" + code - return code - - matmul_kernel = tilelang.compile(program, out_idx=-1, 
execution_backend="tvm_ffi") - - kernel_source = matmul_kernel.get_kernel_source() - - assert stramp in kernel_source, f"Expected {stramp} in the kernel source" - - -def test_gemm_f16f16f16_nn(): - run_gemm( - 512, - 1024, - 768, - False, - False, - "float16", - "float16", - "float16", - 128, - 256, - 32, - 2, - ) - - def matmu_jit_kernel( M, N, diff --git a/testing/python/language/test_tilelang_language_annot.py b/testing/python/language/test_tilelang_language_annot.py new file mode 100644 index 00000000..7425bf5c --- /dev/null +++ b/testing/python/language/test_tilelang_language_annot.py @@ -0,0 +1,71 @@ +import tilelang +import tilelang.language as T +import tilelang.testing +import torch + + +def test_tensor_annot_mul(): + + @tilelang.jit + def example_tensor_annot(): + n = T.symbolic('n') + + @T.prim_func + def kernel(A: T.Tensor((n * 4,), T.int32),): + with T.Kernel(1) as _: + for i in range(n * 4): + A[i] = 0 + + return kernel + + ker = example_tensor_annot() + A = torch.arange(16, dtype=torch.int32, device='cuda') + ker(A) + expected = torch.zeros(16, dtype=torch.int32, device='cuda') + assert torch.equal(A, expected) + + +def test_tensor_annot_add(): + + @tilelang.jit + def example_tensor_annot(): + n = T.symbolic('n') + + @T.prim_func + def kernel(A: T.Tensor((n + 1,), T.int32),): + with T.Kernel(1) as _: + for i in range(n + 1): + A[i] = 0 + + return kernel + + ker = example_tensor_annot() + A = torch.arange(16, dtype=torch.int32, device='cuda') + ker(A) + expected = torch.zeros(16, dtype=torch.int32, device='cuda') + assert torch.equal(A, expected) + + +def test_tensor_annot_mul_add(): + + @tilelang.jit + def example_tensor_annot(): + n = T.symbolic('n') + + @T.prim_func + def kernel(A: T.Tensor((n * 3 + 1,), T.int32),): + with T.Kernel(1) as _: + for i in range(n * 3 + 1): + A[i] = 0 + + return kernel + + ker = example_tensor_annot() + A = torch.arange(16, dtype=torch.int32, device='cuda') + ker(A) + expected = torch.zeros(16, dtype=torch.int32, 
device='cuda') + assert torch.equal(A, expected) + + +if __name__ == '__main__': + tilelang.testing.main() -- GitLab From dd7fdb8ee93cd134fd62636ab65122d7b03173a1 Mon Sep 17 00:00:00 2001 From: Kuris <227995639+kurisu6912@users.noreply.github.com> Date: Thu, 20 Nov 2025 17:33:35 +0800 Subject: [PATCH 026/139] [Feat] add support for passing reference in T.Var annotation (#1291) --- .../test_tilelang_language_frontend_v2.py | 34 ++++++++++ tilelang/language/v2/builder.py | 63 ++++++++++--------- 2 files changed, 67 insertions(+), 30 deletions(-) diff --git a/testing/python/language/test_tilelang_language_frontend_v2.py b/testing/python/language/test_tilelang_language_frontend_v2.py index 1d9a20fe..41657dd7 100644 --- a/testing/python/language/test_tilelang_language_frontend_v2.py +++ b/testing/python/language/test_tilelang_language_frontend_v2.py @@ -361,5 +361,39 @@ def test_while_loop(): assert A[0].item() == sum(range(10)), f"Expected {sum(range(10))}, but got {A[0].item()}" +def test_var_macro(): + try: + + @T.macro + def macro_with_var(x: T.Var): + x = 1 # noqa: F841 + + @T.prim_func + def prim_call_macro(): + with T.Kernel(1): + x = T.alloc_var(T.int32) + macro_with_var(x) + + assert 'x[0] = 1' in prim_call_macro.script() + finally: + pass + + try: + + @T.macro + def macro_with_var(x: T.Var): + x = 1 # noqa: F841 + + @T.prim_func + def prim_call_macro(): + with T.Kernel(1): + x = 1 + macro_with_var(x) + + raise RuntimeError("Expect to report an error, x should not be passed as T.Var") + except ValueError: + pass + + if __name__ == '__main__': tilelang.testing.main() diff --git a/tilelang/language/v2/builder.py b/tilelang/language/v2/builder.py index 6931c5af..e693f850 100644 --- a/tilelang/language/v2/builder.py +++ b/tilelang/language/v2/builder.py @@ -140,6 +140,7 @@ class Builder(BaseBuilder): self.frames: list[AnyFrame] = [] self.ir_builder = IRBuilder() self.name_inside_frame: dict[str, AnyFrame] = {} + self.arg_annotations = {} @classmethod def 
current(cls) -> Self: @@ -155,16 +156,17 @@ class Builder(BaseBuilder): yield @contextmanager - def macro(self, name=None): + def macro(self, name=None, annotations=None): if self.find_frame_idx(BoolOpFrame) is not None: raise RuntimeError( f"Macro `{name}` is used inside boolean expressions, " "please use `if` to replace `M and M`, `M or M`, `M if xxx else M` constructs") - save = self.name_inside_frame + save = self.name_inside_frame, self.arg_annotations self.name_inside_frame = {} + self.arg_annotations = annotations or {} with self.with_frame(MacroFrame()): yield - self.name_inside_frame = save + self.name_inside_frame, self.arg_annotations = save def get(self): return self.ir_builder.get() @@ -313,32 +315,18 @@ class Builder(BaseBuilder): self.check_continue_break() locals = self.get_parent_locals() orig_value = locals.get(name, None) - # annotation like tl.float32 - # temporarily disable annotation based var declaration, for better pull request separation - # if callable(annot): - # annot_val = annot() - # if isinstance(annot_val, tir.Var): - # orig_value = tir.alloc_buffer((1,), dtype=annot_val.dtype, scope='local.var') - # IRBuilder.name(name, orig_value) - # if isinstance(value, EllipsisType) or value is self.empty: - # return orig_value - # elif isinstance(value, (int, float, IntImm, FloatImm)): - # tir.block_attr( - # {'tl.local_var_init': { - # orig_value.data: tvm.runtime.convert(value) - # }}) - # return orig_value # if orig_value is a local.var, we use buffer_store to modify it immutably - # however, if rvalue is also a local.var, this is a new binding, + # however, if rvalue is not a PrimExpr, such as buffer, # we should not use buffer_store, and bind it instead # ```py # a = tl.alloc_var('float32') # bind var `a` # a = tl.alloc_var('float32') # bind a new var `a_1` + # a = tl.alloc_shared((1,), T.float32) # bind a to new buffer # b = a # get value of var `b = a_1[0]`` # c = tl.alloc_var('float32') # bind var `c` # c = a # get and assign `c[0] = 
a_1[0]` # ``` - if is_var(orig_value) and not is_var(value): + if is_var(orig_value) and isinstance(value, (int, float, PrimExpr)): tir.buffer_store(orig_value, value, 0) return orig_value res = self.bind_immutable(name, value) @@ -486,22 +474,34 @@ class Builder(BaseBuilder): ) return self.unwrap_value(value) - def arg(self, name, value): - if self.find_frame_idx(MacroFrame) is not None: - if isinstance(value, (PrimExpr, int, float)): - return self.bind(name, value) - else: - return value + def macro_arg(self, name, value): + if self.arg_annotations.get(name, None) is Var: + is_var = isinstance(value, tvm.tir.BufferLoad) and value.buffer.scope() == 'local.var' + if not is_var: + raise ValueError( + f'Argument `{name}` is expected to be a variable allocated by `T.alloc_var`, but got {value}({type(value)})' + ) + return value.buffer + elif isinstance(value, (PrimExpr, int, float)): + return self.bind(name, value) + else: + return value + + def prim_func_arg(self, name, value): if isinstance(value, (Buffer, Var)): return tir.arg(name, value) elif value is self.empty: raise ValueError(f'Argument `{name}` is not annotated') - # elif isinstance(value, Hashable): - # return value else: raise TypeError( f"Unsupported argument type: {value}({type(value)}) for argument `{name}`.") + def arg(self, name, value): + if self.find_frame_idx(MacroFrame) is not None: + return self.macro_arg(name, value) + else: + return self.prim_func_arg(name, value) + def override(self, name: str): from tilelang.language import serial if name == 'range': @@ -533,6 +533,7 @@ class Macro(Generic[_P, _T]): name: str orig_func: Callable[_P, _T] ir_gen: IRGenerator[_P, _T] + annotations: dict[str, Any] @property def source(self) -> str: @@ -540,7 +541,7 @@ class Macro(Generic[_P, _T]): def __call__(self, *args: _P.args, **kwargs: _P.kwargs) -> _T: builder = Builder.current() - with builder.macro(self.name): + with builder.macro(self.name, self.annotations): res = self.ir_gen.gen(builder)(*args, 
**kwargs) return res @@ -578,7 +579,9 @@ def macro(func: Callable[_P, _T] = None) -> Macro[_P, _T]: """ def impl(func: Callable[_P, _T]) -> Macro[_P, _T]: - return Macro(name=func.__name__, orig_func=func, ir_gen=mutate(func)) + annotations = get_type_hints(func) + return Macro( + name=func.__name__, orig_func=func, ir_gen=mutate(func), annotations=annotations) return impl(func) if func is not None else impl -- GitLab From d4b6d0945e7a45db3883c13ed8d7049b568e0e94 Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:01:38 +0800 Subject: [PATCH 027/139] [Enhancement] Shared Memory Size Can be Dynamic (#1294) * bugfix * lint fix * test * lint fix * increate procs * recover --- .github/workflows/ci.yml | 2 +- 3rdparty/tvm | 2 +- src/tl_templates/cuda/atomic.h | 3 +- .../test_tilelang_language_atomic_add.py | 7 ++- ..._tilelang_runtime_dynamic_shared_memory.py | 52 +++++++++++++++++++ 5 files changed, 58 insertions(+), 8 deletions(-) create mode 100644 testing/python/runtime/test_tilelang_runtime_dynamic_shared_memory.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ee796602..f9fe3286 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -352,7 +352,7 @@ jobs: uv run --no-project -m -- pytest --verbose --color=yes --durations=0 --showlocals --cache-clear ) - "${PYTEST[@]}" --maxfail=3 --numprocesses=1 \ + "${PYTEST[@]}" --maxfail=3 --numprocesses=4 \ ../examples # NVIDIA CUDA tests diff --git a/3rdparty/tvm b/3rdparty/tvm index f4affc7f..713e6ade 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit f4affc7f31e36e7f88c0fe1c715b03215c6a0c62 +Subproject commit 713e6ade56eaa72cc85d58d9228dd9f34cc2d03e diff --git a/src/tl_templates/cuda/atomic.h b/src/tl_templates/cuda/atomic.h index 0bbc4171..f724882e 100644 --- a/src/tl_templates/cuda/atomic.h +++ b/src/tl_templates/cuda/atomic.h @@ -131,8 +131,7 @@ TL_DEVICE void AtomicMin(T1 &ref, T2 val, } else 
{ #if CUDART_VERSION >= 11080 cuda::atomic_ref aref(*address); - return static_cast( - aref.fetch_min(cuda_cast(val), cuda::memory_order(memory_order))); + aref.fetch_min(cuda_cast(val), cuda::memory_order(memory_order)); #else TL_NOT_IMPLEMENTED(); #endif diff --git a/testing/python/language/test_tilelang_language_atomic_add.py b/testing/python/language/test_tilelang_language_atomic_add.py index 132e002a..2472c20f 100644 --- a/testing/python/language/test_tilelang_language_atomic_add.py +++ b/testing/python/language/test_tilelang_language_atomic_add.py @@ -374,10 +374,9 @@ def test_atomic_return_prev(): run_atomic_return_prev(32, 32, 8, 8) -# TODO(lei): test failed and this is experimental -# CC @dyq -# def test_tile_atomic_add(): -# run_tile_atomic_add(8, 128, 128, 32, 32) +def test_tile_atomic_add(): + run_tile_atomic_add(8, 128, 128, 32, 32) + if __name__ == "__main__": tilelang.testing.main() diff --git a/testing/python/runtime/test_tilelang_runtime_dynamic_shared_memory.py b/testing/python/runtime/test_tilelang_runtime_dynamic_shared_memory.py new file mode 100644 index 00000000..7a42b23b --- /dev/null +++ b/testing/python/runtime/test_tilelang_runtime_dynamic_shared_memory.py @@ -0,0 +1,52 @@ +import pytest +import torch + +import tilelang +import tilelang.language as T +import tilelang.testing + + +@tilelang.jit +def dynamic_smem_kernel(): + # Symbolic length to drive dynamic shared memory allocation + length = T.symbolic("len", dtype="int32") # noqa: F821 + + @T.prim_func + def main(global_tensor: T.Tensor[(length,), "int32"]): # noqa: F821 + # Launch a simple kernel that copies from global memory into shared memory + # using a dynamically-sized allocation. No writes back to global_tensor. 
+ with T.Kernel(1, threads=32) as _: + buffer_shared = T.alloc_shared((length,), dtype="int32") # noqa: F821 + T.copy(buffer_shared, global_tensor) + + return main + + +def _require_cuda_tensor(shape, dtype): + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + try: + return torch.randint(0, 100, shape, dtype=dtype, device="cuda") + except RuntimeError as err: + pytest.skip(f"CUDA runtime unavailable: {err}") + + +def _run_and_check(kernel, n): + a = _require_cuda_tensor((n,), torch.int32) + kernel(a) + torch.cuda.synchronize() + + +def test_dynamic_shared_memory_varies_across_calls(): + kernel = dynamic_smem_kernel() + + # Run with different dynamic shared memory sizes across invocations + _run_and_check(kernel, 100) + _run_and_check(kernel, 200) + # Repeat sizes to exercise attribute caching path + _run_and_check(kernel, 200) + _run_and_check(kernel, 100) + + +if __name__ == "__main__": + tilelang.testing.main() -- GitLab From 2426090fdbd9e3e5e6987efd5f37cd0519efee8b Mon Sep 17 00:00:00 2001 From: Kuris <227995639+kurisu6912@users.noreply.github.com> Date: Fri, 21 Nov 2025 17:04:52 +0800 Subject: [PATCH 028/139] [Fix] Remove unused let_bindings_ in CodeGenC to fix #1300 (#1305) * [Feat] add missing support of uint32x2 * [Feat] Add `T.Ref` annotation and tests * fix lint error * minor update for error message on twice decl * Remove unused let_bindings_ in CodeGenC to fix #1300 --- 3rdparty/tvm | 2 +- .../python/language/test_tilelang_intimm.py | 28 ++++++++++++++++ .../test_tilelang_language_frontend_v2.py | 32 +++++++++++++++++++ tilelang/language/__init__.py | 1 + tilelang/language/proxy.py | 10 +++++- tilelang/language/v2/builder.py | 8 +++-- tilelang/language/v2/dtypes.py | 28 ++++++++++++++++ 7 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 testing/python/language/test_tilelang_intimm.py diff --git a/3rdparty/tvm b/3rdparty/tvm index 713e6ade..bc31e7ad 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ 
-Subproject commit 713e6ade56eaa72cc85d58d9228dd9f34cc2d03e +Subproject commit bc31e7ad9f9fafd7659dfabafe359fd55a0ffc1e diff --git a/testing/python/language/test_tilelang_intimm.py b/testing/python/language/test_tilelang_intimm.py new file mode 100644 index 00000000..58fea31d --- /dev/null +++ b/testing/python/language/test_tilelang_intimm.py @@ -0,0 +1,28 @@ +import tilelang +import tilelang.testing +import tilelang.language as T + + +def test_tilelang_intimm(): + T.int32(0x7fffffff) + T.int32(-0x7fffffff - 1) + T.uint32(0xffffffff) + T.int64(0x7fffffffffffffff) + T.int64(-0x7fffffffffffffff - 1) + T.uint64(0xffffffffffffffff) + + a = T.int32() + a & 0x7fffffff + + a = T.uint32() + a & 0xffffffff + + a = T.int64() + a & 0x7fffffffffffffff + + a = T.uint64() + a & T.uint64(0xffffffffffffffff) + + +if __name__ == '__main__': + tilelang.testing.main() diff --git a/testing/python/language/test_tilelang_language_frontend_v2.py b/testing/python/language/test_tilelang_language_frontend_v2.py index 41657dd7..2608e251 100644 --- a/testing/python/language/test_tilelang_language_frontend_v2.py +++ b/testing/python/language/test_tilelang_language_frontend_v2.py @@ -394,6 +394,38 @@ def test_var_macro(): except ValueError: pass + try: + + @T.macro + def macro_with_var(x: T.Ref): + x = 1 # noqa: F841 + + @T.prim_func + def prim_call_macro(): + with T.Kernel(1): + x = T.alloc_var(T.int32) + macro_with_var(x) + + assert 'x[0] = 1' in prim_call_macro.script() + finally: + pass + + try: + + @T.macro + def macro_with_var(x: T.Ref): + x = 1 # noqa: F841 + + @T.prim_func + def prim_call_macro(): + with T.Kernel(1): + x = 1 + macro_with_var(x) + + raise RuntimeError("Expect to report an error, x should not be passed as T.Var") + except ValueError: + pass + if __name__ == '__main__': tilelang.testing.main() diff --git a/tilelang/language/__init__.py b/tilelang/language/__init__.py index 43c721bb..95488bdf 100644 --- a/tilelang/language/__init__.py +++ b/tilelang/language/__init__.py @@ 
-22,6 +22,7 @@ from .proxy import ( FragmentBuffer, # noqa: F401 SharedBuffer, # noqa: F401 LocalBuffer, # noqa: F401 + Ref, # noqa: F401 ) from .loop import serial, Parallel, Persistent, Pipelined # noqa: F401 from .frame import has_let_value, get_let_value # noqa: F401 diff --git a/tilelang/language/proxy.py b/tilelang/language/proxy.py index e2f65e83..9e209a1b 100644 --- a/tilelang/language/proxy.py +++ b/tilelang/language/proxy.py @@ -1,7 +1,7 @@ """The language interface for tl programs.""" from __future__ import annotations -from typing import Any, SupportsIndex, TYPE_CHECKING +from typing import Any, SupportsIndex, TYPE_CHECKING, Generic, TypeVar from collections.abc import Sequence from typing_extensions import Self @@ -263,6 +263,11 @@ if TYPE_CHECKING: class LocalBuffer(BaseTensor): ... + + _T = TypeVar('_T') + + class Ref(Generic[_T], tir.Var): + ... else: Tensor = TensorProxy() # pylint: disable=invalid-name StridedTensor = StridedTensorProxy() # pylint: disable=invalid-name @@ -270,6 +275,9 @@ else: SharedBuffer = SharedBufferProxy() # pylint: disable=invalid-name LocalBuffer = LocalBufferProxy() # pylint: disable=invalid-name + class Ref: + ... + def ptr(dtype: str | None = None, storage_scope: str = "global", diff --git a/tilelang/language/v2/builder.py b/tilelang/language/v2/builder.py index e693f850..643994a4 100644 --- a/tilelang/language/v2/builder.py +++ b/tilelang/language/v2/builder.py @@ -335,7 +335,7 @@ class Builder(BaseBuilder): assert frame is not None, f"Variable `{name}` is not defined inside any control flow." 
if name in self.name_inside_frame and self.name_inside_frame[name] in self.frames: logger.warning( - f'Variable `{name}` shadows another declared value, Are you forgetting to allocate it as a var?', + f'Variable `{name}` is declared twice, are you looking for a T.alloc_var?', stack_info=True, stacklevel=2, ) @@ -475,7 +475,11 @@ class Builder(BaseBuilder): return self.unwrap_value(value) def macro_arg(self, name, value): - if self.arg_annotations.get(name, None) is Var: + from tilelang.language.proxy import Ref + annot_value = self.arg_annotations.get(name, None) + if annot_value is Var or annot_value is Ref: + if annot_value is Var: + logger.warning('Use `T.Var` as macro annotations is deprecated, please use `T.Ref`') is_var = isinstance(value, tvm.tir.BufferLoad) and value.buffer.scope() == 'local.var' if not is_var: raise ValueError( diff --git a/tilelang/language/v2/dtypes.py b/tilelang/language/v2/dtypes.py index 0702635a..75cf83dd 100644 --- a/tilelang/language/v2/dtypes.py +++ b/tilelang/language/v2/dtypes.py @@ -87,8 +87,12 @@ _STR_TO_TVM_DTYPE_CALL = { 'float8_e8m0fnu': 'Float8E8M0FNU' } +int_ = int + def __dtype_call__(self: dtype, expr=None, is_size_var: bool = False) -> tir.Var: + if isinstance(expr, int_): + return tvm.tir.const(expr, dtype=self) if self in _STR_TO_TVM_DTYPE_CALL: attr = _STR_TO_TVM_DTYPE_CALL[self] call = getattr(tb_ffi, attr, None) @@ -151,6 +155,10 @@ if TYPE_CHECKING: class int16(dtype): ... class int32(dtype): ... class int64(dtype): ... + class int8x2(dtype): ... + class int16x2(dtype): ... + class int32x2(dtype): ... + class int64x2(dtype): ... class int8x4(dtype): ... class int16x4(dtype): ... class int32x4(dtype): ... @@ -175,6 +183,10 @@ if TYPE_CHECKING: class uint16(dtype): ... class uint32(dtype): ... class uint64(dtype): ... + class uint8x2(dtype): ... + class uint16x2(dtype): ... + class uint32x2(dtype): ... + class uint64x2(dtype): ... class uint8x4(dtype): ... class uint16x4(dtype): ... class uint32x4(dtype): ... 
@@ -308,6 +320,10 @@ else: int16 = dtype('int16') int32 = dtype('int32') int64 = dtype('int64') + int8x2 = dtype('int8x2') + int16x2 = dtype('int16x2') + int32x2 = dtype('int32x2') + int64x2 = dtype('int64x2') int8x4 = dtype('int8x4') int16x4 = dtype('int16x4') int32x4 = dtype('int32x4') @@ -332,6 +348,10 @@ else: uint16 = dtype('uint16') uint32 = dtype('uint32') uint64 = dtype('uint64') + uint8x2 = dtype('uint8x2') + uint16x2 = dtype('uint16x2') + uint32x2 = dtype('uint32x2') + uint64x2 = dtype('uint64x2') uint8x4 = dtype('uint8x4') uint16x4 = dtype('uint16x4') uint32x4 = dtype('uint32x4') @@ -464,6 +484,10 @@ _all_dtypes = { 'int16', 'int32', 'int64', + 'int8x2', + 'int16x2', + 'int32x2', + 'int64x2', 'int8x4', 'int16x4', 'int32x4', @@ -488,6 +512,10 @@ _all_dtypes = { 'uint16', 'uint32', 'uint64', + 'uint8x2', + 'uint16x2', + 'uint32x2', + 'uint64x2', 'uint8x4', 'uint16x4', 'uint32x4', -- GitLab From 17bbc0ca3d929411dfbd3908bc70085c15a56f07 Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Fri, 21 Nov 2025 17:37:39 +0800 Subject: [PATCH 029/139] [Bugfix] Fallback to the old AtomicAdd implementation for legacy architectures (#1306) --- src/tl_templates/cuda/atomic.h | 59 ++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/src/tl_templates/cuda/atomic.h b/src/tl_templates/cuda/atomic.h index f724882e..05421080 100644 --- a/src/tl_templates/cuda/atomic.h +++ b/src/tl_templates/cuda/atomic.h @@ -169,6 +169,7 @@ TL_DEVICE T1 AtomicMinRet(T1 &ref, T2 val, } } +#if (defined(__CUDA_ARCH_LIST__) && (__CUDA_ARCH_LIST__ > 890)) template TL_DEVICE void AtomicAdd(T1 &ref, T2 val, int memory_order = int(cuda::memory_order_relaxed)) { @@ -236,14 +237,18 @@ TL_DEVICE void AtomicAdd(T1 &ref, T2 val, } } } else { -#if CUDART_VERSION >= 11080 - cuda::atomic_ref aref(*address); - aref.fetch_add(cuda_cast(val), cuda::memory_order(memory_order)); -#else - TL_NOT_IMPLEMENTED(); -#endif + 
atomicAdd(reinterpret_cast(address), cuda_cast(val)); } } +#else +template +TL_DEVICE void AtomicAdd(T1 &ref, T2 val, + int memory_order = int(cuda::memory_order_relaxed)) { + using NT1 = typename normalize_atomic_type::type; + (void)memory_order; + atomicAdd(reinterpret_cast(&ref), cuda_cast(val)); +} +#endif template TL_DEVICE T1 AtomicAddRet(T1 &ref, T2 val, @@ -643,6 +648,48 @@ AtomicAddx4Ret(float *ref, float *val, return ret_val; } } +#else +TL_DEVICE void AtomicAddx2(float *ref, float *val, + int memory_order = int(cuda::memory_order_relaxed)) { + (void)memory_order; + float2 add_val = *reinterpret_cast(val); + atomicAdd(ref + 0, add_val.x); + atomicAdd(ref + 1, add_val.y); +} + +TL_DEVICE float2 +AtomicAddx2Ret(float *ref, float *val, + int memory_order = int(cuda::memory_order_relaxed)) { + (void)memory_order; + float2 add_val = *reinterpret_cast(val); + float2 ret; + ret.x = atomicAdd(ref + 0, add_val.x); + ret.y = atomicAdd(ref + 1, add_val.y); + return ret; +} + +TL_DEVICE void AtomicAddx4(float *ref, float *val, + int memory_order = int(cuda::memory_order_relaxed)) { + (void)memory_order; + float4 add_val = *reinterpret_cast(val); + atomicAdd(ref + 0, add_val.x); + atomicAdd(ref + 1, add_val.y); + atomicAdd(ref + 2, add_val.z); + atomicAdd(ref + 3, add_val.w); +} + +TL_DEVICE float4 +AtomicAddx4Ret(float *ref, float *val, + int memory_order = int(cuda::memory_order_relaxed)) { + (void)memory_order; + float4 add_val = *reinterpret_cast(val); + float4 ret; + ret.x = atomicAdd(ref + 0, add_val.x); + ret.y = atomicAdd(ref + 1, add_val.y); + ret.z = atomicAdd(ref + 2, add_val.z); + ret.w = atomicAdd(ref + 3, add_val.w); + return ret; +} #endif template TL_DEVICE T AtomicLoad(T &ref, int memory_order) { -- GitLab From bf90a5f58c1ce9a3f20144368d72b02ed5fbeae6 Mon Sep 17 00:00:00 2001 From: Kuris <227995639+kurisu6912@users.noreply.github.com> Date: Fri, 21 Nov 2025 20:27:14 +0800 Subject: [PATCH 030/139] [Fix] Fix frame scope error in T.macro (#1308) * [Fix] 
Fix #1307 by adding macro inside function * fix lint error * add comments and fix lint error * Remove debug print from enter_frame method Removed debug print statement from enter_frame method. --------- Co-authored-by: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> --- .../test_tilelang_language_frontend_v2.py | 26 +++++++++++++++++++ tilelang/language/v2/builder.py | 22 ++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/testing/python/language/test_tilelang_language_frontend_v2.py b/testing/python/language/test_tilelang_language_frontend_v2.py index 2608e251..349f3caf 100644 --- a/testing/python/language/test_tilelang_language_frontend_v2.py +++ b/testing/python/language/test_tilelang_language_frontend_v2.py @@ -427,5 +427,31 @@ def test_var_macro(): pass +def frame_inside_macro(): + + @tilelang.jit + def get_sample_kernel(): + + @T.macro + def transform(x): + return x + 1 + + @T.prim_func + def sample_kernel( + num_blocks: T.int32, + idx_out: T.Tensor[(32,), T.int32], + ): + with T.Kernel(num_blocks, threads=32) as block_idx: # noqa: F841 + fragment = T.alloc_fragment(32, 'int32') + T.copy(idx_out, fragment) + + for i in T.Parallel(32): + idx_out[i] = transform(fragment[i]) + + return sample_kernel + + kernel = get_sample_kernel() # noqa: F841 + + if __name__ == '__main__': tilelang.testing.main() diff --git a/tilelang/language/v2/builder.py b/tilelang/language/v2/builder.py index 643994a4..c54b0701 100644 --- a/tilelang/language/v2/builder.py +++ b/tilelang/language/v2/builder.py @@ -80,6 +80,10 @@ class MacroFrame(Frame): ... +class ExitedMacroFrame(Frame): + ... + + class BoolOpFrame(Frame): ... 
@@ -164,8 +168,22 @@ class Builder(BaseBuilder): save = self.name_inside_frame, self.arg_annotations self.name_inside_frame = {} self.arg_annotations = annotations or {} - with self.with_frame(MacroFrame()): - yield + pos = len(self.frames) + # here we add a ExitedMacroFrame to preserve the frame stack inside macro + # because macro may bind some variable, and return it + # + # ```py + # @T.macro + # def foo(x): + # y = x + 1 + # return y + # @T.prim_func + # def bar(): + # c = foo(1) # macro generates let y = x + 1 + # d = c # d = c should lay inside frame of `let y = x + 1` + self.frames.append(MacroFrame()) + yield + self.frames[pos] = ExitedMacroFrame() self.name_inside_frame, self.arg_annotations = save def get(self): -- GitLab From 0d101c110f74ebf2ef8c11a5ece9dfb314b48baa Mon Sep 17 00:00:00 2001 From: Yunqian Fan Date: Fri, 21 Nov 2025 21:20:18 +0800 Subject: [PATCH 031/139] [WIP] support more dtypes for tcgen05 (#1229) support ld with pack for fp32 dtype add dump add tempalte expand remove unused dtype and change to rebased apis --- .../example_tilelang_gemm_fp8_sm100.py | 126 +++ src/op/copy.cc | 14 +- src/op/gemm_py.cc | 2 + src/op/tcgen5_meta.h | 38 +- src/tl_templates/cuda/copy_sm100.h | 35 +- src/tl_templates/cuda/gemm_sm100.h | 76 +- src/tl_templates/cuda/tcgen_05_ld.h | 755 +++++++++++++++++- tilelang/intrinsics/mma_macro_generator.py | 3 + .../intrinsics/tcgen05_macro_generator.py | 9 +- tilelang/jit/adapter/wrapper.py | 1 + tilelang/tileop/gemm/gemm_tcgen05.py | 5 +- 11 files changed, 976 insertions(+), 88 deletions(-) create mode 100644 examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py new file mode 100644 index 00000000..4628a997 --- /dev/null +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py @@ -0,0 +1,126 @@ +import torch +import tilelang +import tilelang.language as T +from tilelang.utils.tensor import 
map_torch_type + + +def matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + mbar = T.alloc_barrier(1) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[bx * block_N, k * block_K], B_shared) + T.gemm_v2( + A_shared, + B_shared, + C_tmem, + trans_A, + trans_B, + mbar=mbar, + wg_wait=-1, + clear_accum=(k == 0), + ) + T.mbarrier_wait_parity(mbar, k % 2) + + T.copy(C_tmem, C_local) + T.copy(C_local, C_shared) + + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return main + + +def calc_diff(x, y): + x, y = x.double(), y.double() + denominator = (x * x + y * y).sum() + sim = 2 * (x * y).sum() / denominator + return 1 - sim + + +M, N, K = 4096, 4096, 8192 +block_M, block_N, block_K = 64, 256, 32 +trans_A, trans_B = False, True +num_stages = 2 +threads = 256 +for tvm_fp8_dtype in ["float8_e4m3", "float8_e5m2"]: + for tvm_acc_dtype in ["float16", "float32"]: # , torch.float16]: + torch_fp8_dtype = map_torch_type(tvm_fp8_dtype) + torch_acc_dtype = map_torch_type(tvm_acc_dtype) + print(f"running {tvm_fp8_dtype} -> {tvm_acc_dtype}") + in_dtype, out_dtype, accum_dtype 
= tvm_fp8_dtype, tvm_acc_dtype, tvm_acc_dtype + + func = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, + ) + jit_kernel = tilelang.compile( + func, + out_idx=[2], + target="cuda", + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_ENABLE_PTXAS_VERBOSE_OUTPUT: True, + }, + ) + # jit_kernel.export_ptx("./dump.ptx") + # jit_kernel.export_sources("./dump.cu") + + a = torch.randn(M, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) + b = torch.randn(N, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) + + c = jit_kernel(a, b) + ref_c = (a.to(torch.half) @ b.T.to(torch.half)).float() + c = c.float() + diff = calc_diff(c, ref_c) + # assert diff < 1e-3, f"{diff}" + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] diff = {diff}") + + profiler = jit_kernel.get_profiler() + latency = profiler.do_bench() + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Latency: {latency} ms") + print( + f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Flops: {2 * M * N * K / (latency / 1e3) / 1e12} TFLOPS" + ) diff --git a/src/op/copy.cc b/src/op/copy.cc index 5d352904..8ffef5ea 100644 --- a/src/op/copy.cc +++ b/src/op/copy.cc @@ -1117,6 +1117,11 @@ Stmt CopyNode::LowerTmemCopy(const LowerArgs &T, bool is_ld = false; // tcgen05.ld (tensor memory -> register) bool is_st = false; // tcgen05.st (register -> tensor memory) bool is_cp = false; // tcgen05.cp (shared memory -> tensor memory) + bool src_needs_pack = + 16 == src->dtype.bits(); // if needs .pack::16b when is_ld + bool dst_needs_unpack = + 16 == dst->dtype.bits(); // if needs .unpack::16b when is_st + if (src.scope() == "shared.tmem" && dst.scope() == "local.fragment") { is_ld = true; } else if (src.scope() == "local.fragment" && dst.scope() == "shared.tmem") { @@ -1124,9 +1129,8 @@ Stmt CopyNode::LowerTmemCopy(const LowerArgs &T, } 
else if (src.scope() == "shared.dyn" && dst.scope() == "shared.tmem") { is_cp = true; } else { - ICHECK(0) << "Unsupported tensor memory copy: " - << "src scope = " << src.scope() - << ", dst scope = " << dst.scope(); + ICHECK(0) << "Unsupported tensor memory copy: " << "src scope = " + << src.scope() << ", dst scope = " << dst.scope(); } // Currently tcgen05.cp is not supported // TODO (mzw) Support tcgen05.cp @@ -1246,8 +1250,10 @@ Stmt CopyNode::LowerTmemCopy(const LowerArgs &T, : relative_wg_idx * (num_chunks_each_wg * meta.width); have_succeeded = true; Array args; + const char *bool_str = src_needs_pack ? "true" : "false"; args.push_back(StringImm(meta.intrinsics_name + "<" + - std::to_string(num_chunks_each_wg) + ">")); + std::to_string(num_chunks_each_wg) + ", " + + bool_str + ">")); args.push_back( BufferLoad(src, {(int)logical_row_min, (int)logical_col_min})); // Will be translated later diff --git a/src/op/gemm_py.cc b/src/op/gemm_py.cc index ac506ee0..6097998c 100644 --- a/src/op/gemm_py.cc +++ b/src/op/gemm_py.cc @@ -428,6 +428,8 @@ TVM_FFI_STATIC_INIT_BLOCK() { result.push_back(Integer(meta.atom_m)); result.push_back(Integer(meta.atom_n)); result.push_back(Integer(meta.atom_k)); + result.push_back(Integer(meta.enable_ws)); + result.push_back(Integer(meta.enable_2cta)); } return result; }); diff --git a/src/op/tcgen5_meta.h b/src/op/tcgen5_meta.h index bb63c8dc..350a2bc8 100644 --- a/src/op/tcgen5_meta.h +++ b/src/op/tcgen5_meta.h @@ -15,16 +15,19 @@ using runtime::DataType; struct TCGEN5MMAMeta { int atom_m, atom_n, atom_k; + bool enable_ws, enable_2cta; }; inline std::pair GetTCGEN5MMAMeta(int M, int N, int K, DataType ab_dtype, DataType c_dtype) { // TODO (lei) Currently not all shapes / dtypes are supported for TCGEN5MMA. 
#define FAIL \ - return { false, TCGEN5MMAMeta{0, 0, 0} } -#define SUCCESS(atom_m, atom_n, atom_k) \ return { \ - true, TCGEN5MMAMeta { atom_m, atom_n, atom_k } \ + false, TCGEN5MMAMeta { 0, 0, 0, false, false } \ + } +#define SUCCESS(atom_m, atom_n, atom_k, use_ws, use_2cta) \ + return { \ + true, TCGEN5MMAMeta { atom_m, atom_n, atom_k, use_ws, use_2cta } \ } std::vector ws_valid_atom_ns = {256, 128, 64}; if ((ab_dtype.is_bfloat16() || ab_dtype.is_float16()) && @@ -34,39 +37,52 @@ GetTCGEN5MMAMeta(int M, int N, int K, DataType ab_dtype, DataType c_dtype) { if (M % 128 == 0) { for (int atom_n = 256; atom_n >= 16; atom_n -= 16) if (N % atom_n == 0) - SUCCESS(128, atom_n, 16); + SUCCESS(128, atom_n, 16, false, false); FAIL; } else if (M % 64 == 0) { for (int atom_n : ws_valid_atom_ns) if (N % atom_n == 0) - SUCCESS(64, atom_n, 16); + SUCCESS(64, atom_n, 16, false, false); FAIL; } else if (M % 32 == 0) { for (int atom_n : ws_valid_atom_ns) if (N % atom_n == 0) - SUCCESS(32, atom_n, 16); + SUCCESS(32, atom_n, 16, false, false); FAIL; } else { FAIL; } - } else if ((ab_dtype.is_float8_e4m3fn() || ab_dtype.is_float8_e5m2()) && - (c_dtype.is_float() && c_dtype.bits() == 32)) { + } else if ((ab_dtype.is_float8_e4m3fn() || ab_dtype.is_float8_e4m3() || + ab_dtype.is_float8_e5m2() || ab_dtype.is_float8_e5m2fnuz() || + ab_dtype.is_float6_e2m3fn() || ab_dtype.is_float6_e3m2fn() || + ab_dtype.is_float4_e2m1fn()) && + ((c_dtype.is_float() && c_dtype.bits() == 32) || + (c_dtype.is_float16() && c_dtype.bits() == 16))) { if (K % 32 != 0) FAIL; if (M % 128 == 0) { + for (int atom_n : ws_valid_atom_ns) + if (N % atom_n == 0) + SUCCESS(128, atom_n, 32, true, false); for (int atom_n = 256; atom_n >= 16; atom_n -= 16) if (N % atom_n == 0) - SUCCESS(128, atom_n, 32); + SUCCESS(128, atom_n, 32, false, true); + for (int atom_n = 256; atom_n >= 8; atom_n -= 8) + if (N % atom_n == 0) + SUCCESS(128, atom_n, 32, false, false); FAIL; } else if (M % 64 == 0) { for (int atom_n : ws_valid_atom_ns) 
if (N % atom_n == 0) - SUCCESS(64, atom_n, 32); + SUCCESS(64, atom_n, 32, true, false); + for (int atom_n = 256; atom_n >= 8; atom_n -= 8) + if (N % atom_n == 0) + SUCCESS(128, atom_n, 32, false, false); FAIL; } else if (M % 32 == 0) { for (int atom_n : ws_valid_atom_ns) if (N % atom_n == 0) - SUCCESS(32, atom_n, 32); + SUCCESS(32, atom_n, 32, true, false); FAIL; } else { FAIL; diff --git a/src/tl_templates/cuda/copy_sm100.h b/src/tl_templates/cuda/copy_sm100.h index c4047c34..aa898bcc 100644 --- a/src/tl_templates/cuda/copy_sm100.h +++ b/src/tl_templates/cuda/copy_sm100.h @@ -51,6 +51,21 @@ __device__ __forceinline__ void st_global_256(fp8_e4_32_t *ptr, : : "l"(ptr), "l"(val.x), "l"(val.y), "l"(val.z), "l"(val.w)); } +__device__ __forceinline__ ulonglong4 ld_global_256(const fp8_e5_32_t *ptr) { + ulonglong4 ret; + asm volatile("ld.global.v4.u64 {%0, %1, %2, %3}, [%4];" + : "=l"(ret.x), "=l"(ret.y), "=l"(ret.z), "=l"(ret.w) + : "l"(ptr)); + return ret; +} + +__device__ __forceinline__ void st_global_256(fp8_e5_32_t *ptr, + fp8_e5_32_t &val8) { + ulonglong4 &val = *((ulonglong4 *)&val8); + asm volatile("st.global.v4.u64 [%0], {%1, %2, %3, %4};" + : + : "l"(ptr), "l"(val.x), "l"(val.y), "l"(val.z), "l"(val.w)); +} __device__ __forceinline__ unsigned long long pack_bfloat16x4(const bfloat16_t x, const bfloat16_t y, const bfloat16_t z, @@ -95,38 +110,38 @@ __device__ __forceinline__ void tcgen05_ld_core(uint32_t const &tmem_start_col, } } -template +template __device__ __forceinline__ void tcgen05_ld_32dp32bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core(tmem_start_col + tmem_col_offset, - dst_ptr); + tcgen05_ld_core, 7, N>( + tmem_start_col + tmem_col_offset, dst_ptr); tl::fence_view_async_tmem_load(); } -template +template __device__ __forceinline__ void tcgen05_ld_32dp64bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core(tmem_start_col + tmem_col_offset, - 
dst_ptr); + tcgen05_ld_core, 7, N>( + tmem_start_col + tmem_col_offset, dst_ptr); tl::fence_view_async_tmem_load(); } -template +template __device__ __forceinline__ void tcgen05_ld_32dp128bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core( + tcgen05_ld_core, 6, N>( tmem_start_col + tmem_col_offset, dst_ptr); tl::fence_view_async_tmem_load(); } -template +template __device__ __forceinline__ void tcgen05_ld_32dp256bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core( + tcgen05_ld_core, 5, N>( tmem_start_col + tmem_col_offset, dst_ptr); tl::fence_view_async_tmem_load(); } diff --git a/src/tl_templates/cuda/gemm_sm100.h b/src/tl_templates/cuda/gemm_sm100.h index 856d37dd..6c68c2c2 100644 --- a/src/tl_templates/cuda/gemm_sm100.h +++ b/src/tl_templates/cuda/gemm_sm100.h @@ -243,46 +243,96 @@ struct DispatchInstruction -struct DispatchInstruction> { - using MMA = MMA_Traits, - Int, integral_constant, + using MMA = + MMA_Traits, Int, integral_constant, + integral_constant, + integral_constant, + integral_constant>; +}; +template +struct DispatchInstruction> { + using MMA = MMA_Traits, Int, + integral_constant, integral_constant, integral_constant, integral_constant>; }; template -struct DispatchInstruction> { +struct DispatchInstruction> { using MMA = - MMA_Traits, - Int, integral_constant, + MMA_Traits, Int, integral_constant, integral_constant, integral_constant, integral_constant>; }; +template +struct DispatchInstruction> { + using MMA = MMA_Traits, Int, + integral_constant, + integral_constant, + integral_constant, + integral_constant>; +}; template -struct DispatchInstruction> { - using MMA = MMA_Traits, - Int, integral_constant, + using MMA = + MMA_Traits, Int, integral_constant, + integral_constant, + integral_constant, + integral_constant>; +}; +template +struct DispatchInstruction> { + using MMA = MMA_Traits, Int, + integral_constant, integral_constant, 
integral_constant, integral_constant>; }; template -struct DispatchInstruction> { using MMA = - MMA_Traits, - Int, integral_constant, + MMA_Traits, Int, integral_constant, integral_constant, integral_constant, integral_constant>; }; +template +struct DispatchInstruction> { + using MMA = MMA_Traits, Int, + integral_constant, + integral_constant, + integral_constant, + integral_constant>; +}; template class tmem_ld_32dp32bNx; + +template <> class tmem_ld_32dp32bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { @@ -180,9 +182,180 @@ public: } } }; +template <> class tmem_ld_32dp32bNx { +public: + template + static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 128, + "N must be a power of 2 and lies between 1 ~ 128"); + + if constexpr (N == 1) { + asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x1.b32" + "{%0}," + "[%1];\n" + : "=r"(dst_ptr[0]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x2.b32" + "{%0, %1}," + "[%2];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) + : "r"(src_addr)); + } else if constexpr (N == 4) { + asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x4.b32" + "{%0, %1, %2, %3}," + "[%4];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]) + : "r"(src_addr)); + } else if constexpr (N == 8) { + asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x8.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "[%8];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) + : "r"(src_addr)); + } else if constexpr (N == 16) { + asm volatile( + "tcgen05.ld.sync.aligned.32x32b.pack::16b.x16.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}," + "[%16];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + 
"=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]) + : "r"(src_addr)); + } else if constexpr (N == 32) { + asm volatile( + "tcgen05.ld.sync.aligned.32x32b.pack::16b.x32.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " + "%26, %27, %28, %29, %30, %31}," + "[%32];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) + : "r"(src_addr)); + } else if constexpr (N == 64) { + asm volatile( + "tcgen05.ld.sync.aligned.32x32b.pack::16b.x64.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63}," + "[%64];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), 
"=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]) + : "r"(src_addr)); + } else if constexpr (N == 128) { + asm volatile( + "tcgen05.ld.sync.aligned.32x32b.pack::16b.x128.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " + "%70, " + "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " + "%84, " + "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " + "%98, " + "%99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, " + "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}," + "[%128];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), 
"=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), + "=r"(dst_ptr[66]), "=r"(dst_ptr[67]), "=r"(dst_ptr[68]), + "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), + "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), + "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), + "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), + "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), + "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), + "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), + "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), + "=r"(dst_ptr[93]), "=r"(dst_ptr[94]), "=r"(dst_ptr[95]), + "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), + "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), + "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), + "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), + "=r"(dst_ptr[108]), "=r"(dst_ptr[109]), "=r"(dst_ptr[110]), + 
"=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), + "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), + "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), + "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), + "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), "=r"(dst_ptr[125]), + "=r"(dst_ptr[126]), "=r"(dst_ptr[127]) + : "r"(src_addr)); + } else { + asm volatile("trap"); + } + } +}; // 16 data path lanes, 64-bit pattern, repeated N times -class tmem_ld_16dp64bNx { +template class tmem_ld_16dp64bNx; +template <> class tmem_ld_16dp64bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { @@ -352,39 +525,43 @@ public: } } }; - -// 16 data path lanes, 128-bit pattern, repeated N times -class tmem_ld_16dp128bNx { +template <> class tmem_ld_16dp64bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 64, - "N must be a power of 2 and lies between 1 ~ 64"); + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 128, + "N must be a power of 2 and lies between 1 ~ 128"); if constexpr (N == 1) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.x1.b32" + asm volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x1.b32" + "{%0}," + "[%1];\n" + : "=r"(dst_ptr[0]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x2.b32" "{%0, %1}," "[%2];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) : "r"(src_addr)); - } else if constexpr (N == 2) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.x2.b32" + } else if constexpr (N == 4) { + asm volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x4.b32" "{%0, %1, %2, %3}," "[%4];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]) : "r"(src_addr)); - } else if constexpr (N == 4) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.x4.b32" + } else if constexpr (N == 8) { + asm 
volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x8.b32" "{%0, %1, %2, %3, %4, %5, %6, %7}," "[%8];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) : "r"(src_addr)); - } else if constexpr (N == 8) { + } else if constexpr (N == 16) { asm volatile( - "tcgen05.ld.sync.aligned.16x128b.x8.b32" + "tcgen05.ld.sync.aligned.16x64b.pack::16b.x16.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15}," "[%16];\n" @@ -395,9 +572,9 @@ public: "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), "=r"(dst_ptr[15]) : "r"(src_addr)); - } else if constexpr (N == 16) { + } else if constexpr (N == 32) { asm volatile( - "tcgen05.ld.sync.aligned.16x128b.x16.b32" + "tcgen05.ld.sync.aligned.16x64b.pack::16b.x32.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " "%26, %27, %28, %29, %30, %31}," @@ -414,9 +591,9 @@ public: "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) : "r"(src_addr)); - } else if constexpr (N == 32) { + } else if constexpr (N == 64) { asm volatile( - "tcgen05.ld.sync.aligned.16x128b.x32.b32" + "tcgen05.ld.sync.aligned.16x64b.pack::16b.x64.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " "%28, " @@ -449,9 +626,9 @@ public: "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), "=r"(dst_ptr[63]) : "r"(src_addr)); - } else if constexpr (N == 64) { + } else if constexpr (N == 128) { asm volatile( - "tcgen05.ld.sync.aligned.16x128b.x64.b32" + "tcgen05.ld.sync.aligned.16x64b.pack::16b.x128.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " "%28, " @@ -519,32 +696,39 @@ public: } }; -// 16 data path lanes, 256-bit pattern, repeated N times 
-class tmem_ld_16dp256bNx { +// 16 data path lanes, 128-bit pattern, repeated N times +template class tmem_ld_16dp128bNx; +template <> class tmem_ld_16dp128bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 32, - "N must be a power of 2 and lies between 1 ~ 32"); + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 64, + "N must be a power of 2 and lies between 1 ~ 64"); if constexpr (N == 1) { - asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32" + asm volatile("tcgen05.ld.sync.aligned.16x128b.x1.b32" + "{%0, %1}," + "[%2];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.x2.b32" "{%0, %1, %2, %3}," "[%4];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]) : "r"(src_addr)); - } else if constexpr (N == 2) { - asm volatile("tcgen05.ld.sync.aligned.16x256b.x2.b32" + } else if constexpr (N == 4) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.x4.b32" "{%0, %1, %2, %3, %4, %5, %6, %7}," "[%8];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) : "r"(src_addr)); - } else if constexpr (N == 4) { + } else if constexpr (N == 8) { asm volatile( - "tcgen05.ld.sync.aligned.16x256b.x4.b32" + "tcgen05.ld.sync.aligned.16x128b.x8.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15}," "[%16];\n" @@ -555,9 +739,9 @@ public: "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), "=r"(dst_ptr[15]) : "r"(src_addr)); - } else if constexpr (N == 8) { + } else if constexpr (N == 16) { asm volatile( - "tcgen05.ld.sync.aligned.16x256b.x8.b32" + "tcgen05.ld.sync.aligned.16x128b.x16.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " "%26, %27, %28, %29, %30, %31}," @@ 
-574,9 +758,9 @@ public: "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) : "r"(src_addr)); - } else if constexpr (N == 16) { + } else if constexpr (N == 32) { asm volatile( - "tcgen05.ld.sync.aligned.16x256b.x16.b32" + "tcgen05.ld.sync.aligned.16x128b.x32.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " "%28, " @@ -609,9 +793,492 @@ public: "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), "=r"(dst_ptr[63]) : "r"(src_addr)); - } else if constexpr (N == 32) { + } else if constexpr (N == 64) { asm volatile( - "tcgen05.ld.sync.aligned.16x256b.x32.b32" + "tcgen05.ld.sync.aligned.16x128b.x64.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " + "%70, " + "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " + "%84, " + "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " + "%98, " + "%99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, " + "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}," + "[%128];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), 
"=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), + "=r"(dst_ptr[66]), "=r"(dst_ptr[67]), "=r"(dst_ptr[68]), + "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), + "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), + "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), + "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), + "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), + "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), + "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), + "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), + "=r"(dst_ptr[93]), "=r"(dst_ptr[94]), "=r"(dst_ptr[95]), + "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), + "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), + "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), + "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), + "=r"(dst_ptr[108]), "=r"(dst_ptr[109]), "=r"(dst_ptr[110]), + "=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), + "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), + "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), + "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), + "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), 
"=r"(dst_ptr[125]), + "=r"(dst_ptr[126]), "=r"(dst_ptr[127]) + : "r"(src_addr)); + } else { + asm volatile("trap"); + } + } +}; +template <> class tmem_ld_16dp128bNx { +public: + template + static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 64, + "N must be a power of 2 and lies between 1 ~ 64"); + + if constexpr (N == 1) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.pack::16b.x1.b32" + "{%0, %1}," + "[%2];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.pack::16b.x2.b32" + "{%0, %1, %2, %3}," + "[%4];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]) + : "r"(src_addr)); + } else if constexpr (N == 4) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.pack::16b.x4.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "[%8];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) + : "r"(src_addr)); + } else if constexpr (N == 8) { + asm volatile( + "tcgen05.ld.sync.aligned.16x128b.pack::16b.x8.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}," + "[%16];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]) + : "r"(src_addr)); + } else if constexpr (N == 16) { + asm volatile( + "tcgen05.ld.sync.aligned.16x128b.pack::16b.x16.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " + "%26, %27, %28, %29, %30, %31}," + "[%32];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), 
"=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) + : "r"(src_addr)); + } else if constexpr (N == 32) { + asm volatile( + "tcgen05.ld.sync.aligned.16x128b.pack::16b.x32.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63}," + "[%64];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), 
"=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]) + : "r"(src_addr)); + } else if constexpr (N == 64) { + asm volatile( + "tcgen05.ld.sync.aligned.16x128b.pack::16b.x64.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " + "%70, " + "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " + "%84, " + "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " + "%98, " + "%99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, " + "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}," + "[%128];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + 
"=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), + "=r"(dst_ptr[66]), "=r"(dst_ptr[67]), "=r"(dst_ptr[68]), + "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), + "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), + "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), + "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), + "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), + "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), + "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), + "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), + "=r"(dst_ptr[93]), "=r"(dst_ptr[94]), "=r"(dst_ptr[95]), + "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), + "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), + "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), + "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), + "=r"(dst_ptr[108]), "=r"(dst_ptr[109]), "=r"(dst_ptr[110]), + "=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), + "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), + "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), + "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), + "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), "=r"(dst_ptr[125]), + "=r"(dst_ptr[126]), "=r"(dst_ptr[127]) + : "r"(src_addr)); + } else { + asm volatile("trap"); + } + } +}; + +// 16 data path lanes, 256-bit pattern, repeated N times +template class tmem_ld_16dp256bNx; +template <> class tmem_ld_16dp256bNx { +public: + template + static TL_DEVICE void copy(uint32_t const &src_addr, 
uint32_t *dst_ptr) { + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 32, + "N must be a power of 2 and lies between 1 ~ 32"); + + if constexpr (N == 1) { + asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32" + "{%0, %1, %2, %3}," + "[%4];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x256b.x2.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "[%8];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) + : "r"(src_addr)); + } else if constexpr (N == 4) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.x4.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}," + "[%16];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]) + : "r"(src_addr)); + } else if constexpr (N == 8) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.x8.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " + "%26, %27, %28, %29, %30, %31}," + "[%32];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), 
"=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) + : "r"(src_addr)); + } else if constexpr (N == 16) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.x16.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63}," + "[%64];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]) + : "r"(src_addr)); + } else if constexpr (N == 32) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.x32.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, 
%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " + "%70, " + "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " + "%84, " + "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " + "%98, " + "%99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, " + "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}," + "[%128];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), + "=r"(dst_ptr[66]), 
"=r"(dst_ptr[67]), "=r"(dst_ptr[68]), + "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), + "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), + "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), + "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), + "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), + "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), + "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), + "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), + "=r"(dst_ptr[93]), "=r"(dst_ptr[94]), "=r"(dst_ptr[95]), + "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), + "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), + "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), + "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), + "=r"(dst_ptr[108]), "=r"(dst_ptr[109]), "=r"(dst_ptr[110]), + "=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), + "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), + "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), + "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), + "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), "=r"(dst_ptr[125]), + "=r"(dst_ptr[126]), "=r"(dst_ptr[127]) + : "r"(src_addr)); + } else { + asm volatile("trap"); + } + } +}; +template <> class tmem_ld_16dp256bNx { +public: + template + static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 32, + "N must be a power of 2 and lies between 1 ~ 32"); + + if constexpr (N == 1) { + asm volatile("tcgen05.ld.sync.aligned.16x256b.pack::16b.x1.b32" + "{%0, %1, %2, %3}," + "[%4];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x256b.pack::16b.x2.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "[%8];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), 
"=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) + : "r"(src_addr)); + } else if constexpr (N == 4) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.pack::16b.x4.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}," + "[%16];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]) + : "r"(src_addr)); + } else if constexpr (N == 8) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.pack::16b.x8.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " + "%26, %27, %28, %29, %30, %31}," + "[%32];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) + : "r"(src_addr)); + } else if constexpr (N == 16) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.pack::16b.x16.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63}," + 
"[%64];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]) + : "r"(src_addr)); + } else if constexpr (N == 32) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.pack::16b.x32.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " "%28, " @@ -681,32 +1348,32 @@ public: // 32 data path lanes, 64-bit pattern, repeated N times // (conducted with 2x16dp64bNx) -class tmem_ld_32dp64bNx { +template class tmem_ld_32dp64bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - tmem_ld_16dp64bNx::copy(src_addr, dst_ptr); - tmem_ld_16dp64bNx::copy(src_addr + (16 << 16), dst_ptr + N); + tmem_ld_16dp64bNx::copy(src_addr, dst_ptr); + tmem_ld_16dp64bNx::copy(src_addr + (16 
<< 16), dst_ptr + N); } }; // 32 data path lanes, 128-bit pattern, repeated N times -class tmem_ld_32dp128bNx { +template class tmem_ld_32dp128bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - tmem_ld_16dp128bNx::copy(src_addr, dst_ptr); - tmem_ld_16dp128bNx::copy(src_addr + (16 << 16), dst_ptr + N * 2); + tmem_ld_16dp128bNx::copy(src_addr, dst_ptr); + tmem_ld_16dp128bNx::copy(src_addr + (16 << 16), dst_ptr + N * 2); } }; // 32 data path lanes, 256-bit pattern, repeated N times -class tmem_ld_32dp256bNx { +template class tmem_ld_32dp256bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - tmem_ld_16dp256bNx::copy(src_addr, dst_ptr); - tmem_ld_16dp256bNx::copy(src_addr + (16 << 16), dst_ptr + N * 4); + tmem_ld_16dp256bNx::copy(src_addr, dst_ptr); + tmem_ld_16dp256bNx::copy(src_addr + (16 << 16), dst_ptr + N * 4); } }; diff --git a/tilelang/intrinsics/mma_macro_generator.py b/tilelang/intrinsics/mma_macro_generator.py index 8c546c63..bbfeb157 100644 --- a/tilelang/intrinsics/mma_macro_generator.py +++ b/tilelang/intrinsics/mma_macro_generator.py @@ -45,7 +45,10 @@ class TensorCoreIntrinEmitter: "int8": "int8", "int32": "int32", "float8_e4m3": "e4m3", + "float8_e4m3fn": "e4m3", + "float8_e4m3fnuz": "e4m3", "float8_e5m2": "e5m2", + "float8_e5m2fnuz": "e5m2", } # Represent the thread binding in the form of (tx, warp_n, warp_m) diff --git a/tilelang/intrinsics/tcgen05_macro_generator.py b/tilelang/intrinsics/tcgen05_macro_generator.py index e53ff7cb..966f4dc4 100644 --- a/tilelang/intrinsics/tcgen05_macro_generator.py +++ b/tilelang/intrinsics/tcgen05_macro_generator.py @@ -169,12 +169,11 @@ class TensorCoreIntrinEmitter(MMAIntrinEmitter): accum_dtype_in_bits = DataType(accum_dtype).bits meta = self.get_tcgen5_mma_meta(m_dim, n_dim, k_dim) - if len(meta) != 3: + if len(meta) != 5: raise ValueError( f"Unsupported TCGEN5MMA configuration for desc generation: M={m_dim}, N={n_dim}, 
" f"K={k_dim}, A dtype={self.a_dtype}, accum dtype={self.accum_dtype}") - atom_m, atom_n, atom_k = (int(x) for x in meta) - enable_ws = atom_m != 128 + atom_m, atom_n, atom_k, enable_ws, enable_2cta = (int(x) for x in meta) # by default, we utilize non-swizzle layout offset a_leading_byte_offset = (8 * 8 * elems_in_bytes) if a_is_k_major else (8 * m_dim * @@ -382,10 +381,10 @@ class TensorCoreIntrinEmitter(MMAIntrinEmitter): k = int(self.chunk) meta = self.get_tcgen5_mma_meta(m, n, k) - if len(meta) != 3: + if len(meta) != 5: raise ValueError(f"Unsupported TCGEN5MMA configuration: M={m}, N={n}, K={k}, " f"A dtype={self.a_dtype}, accum dtype={self.accum_dtype}") - atom_m, atom_n, _ = (int(x) for x in meta) + atom_m, atom_n, _, _, _ = (int(x) for x in meta) if m % atom_m != 0 or n % atom_n != 0: raise ValueError( diff --git a/tilelang/jit/adapter/wrapper.py b/tilelang/jit/adapter/wrapper.py index 48b8e908..75607976 100644 --- a/tilelang/jit/adapter/wrapper.py +++ b/tilelang/jit/adapter/wrapper.py @@ -144,6 +144,7 @@ class TLCUDASourceWrapper: "float16": "half_t", "bfloat16": "bfloat16_t", "float8_e4m3": "fp8_e4_t", + "float8_e4m3fn": "fp8_e4_t", "float8_e5m2": "fp8_e5_t", "float64": "double", "int64": "int64_t", diff --git a/tilelang/tileop/gemm/gemm_tcgen05.py b/tilelang/tileop/gemm/gemm_tcgen05.py index 52c192e5..1de9fe87 100644 --- a/tilelang/tileop/gemm/gemm_tcgen05.py +++ b/tilelang/tileop/gemm/gemm_tcgen05.py @@ -85,6 +85,9 @@ class GemmTCGEN5(GemmBase): raise ValueError(f"TCGEN5MMA currently only supports gemm_ss, got " f"A scope {self.A.scope()}, B scope {self.B.scope()}") + atom_m, atom_n, atom_k, enable_ws, enable_2cta = mma_emitter.get_tcgen5_mma_meta( + self.M, self.N, self.K) + if self.A.scope() not in {"shared", "shared.dyn", "shared.tmem"}: raise ValueError(f"Unsupported A scope for TCGEN5MMA: {self.A.scope()}") if self.B.scope() not in {"shared", "shared.dyn"}: @@ -103,7 +106,7 @@ class GemmTCGEN5(GemmBase): raise ValueError("TCGEN5MMA expects 2D 
coordinates for C buffer access") accum_dtype = str(self.C.dtype) - if accum_dtype != "float32": + if accum_dtype not in ["float32", 'float16']: raise ValueError(f"Unsupported accumulator dtype for TCGEN5MMA: {accum_dtype}") A_shared = self.ARegion -- GitLab From 470eb74cac8e1ea4f99547de5ea5cb24feabb2c9 Mon Sep 17 00:00:00 2001 From: LJC00118 <77378439+LJC00118@users.noreply.github.com> Date: Sat, 22 Nov 2025 12:03:23 +0800 Subject: [PATCH 032/139] Improve memory access safety and `T.assume` handling (#1292) * Improve memory access safety and T.assume handling * Improve memory access safety and T.assume handling * bugfix * lint fix * bugfix * bugfix * refactor legalize safe memory access pass --------- Co-authored-by: Lei Wang --- src/transform/legalize_safe_memory_access.cc | 168 ++++++------------- src/transform/simplify.cc | 10 ++ 2 files changed, 58 insertions(+), 120 deletions(-) diff --git a/src/transform/legalize_safe_memory_access.cc b/src/transform/legalize_safe_memory_access.cc index 68a0cdbb..1a9da919 100644 --- a/src/transform/legalize_safe_memory_access.cc +++ b/src/transform/legalize_safe_memory_access.cc @@ -24,32 +24,6 @@ namespace tl { using namespace tir; using arith::IRMutatorWithAnalyzer; -// Helper class to find leaf For nodes in a given IR -class LeafForFinder : public StmtVisitor { -public: - std::vector leaf_for_nodes; - -private: - void VisitStmt_(const ForNode *op) final { - has_child_for_ = false; - bool parent_has_child_for = parent_has_child_for_; - parent_has_child_for_ = false; - - StmtVisitor::VisitStmt(op->body); - - if (!has_child_for_) { - leaf_for_nodes.push_back(tvm::ffi::GetRef(op)); - } - - parent_has_child_for_ = parent_has_child_for; - parent_has_child_for_ = true; - } - -private: - bool has_child_for_ = false; - bool parent_has_child_for_ = false; -}; - // GlobalMemChecker for a BufferLoad/BufferStore node: // 1. Identify BufferLoad and BufferStore nodes. // 2. Check if the buffer is in global scope. 
@@ -109,13 +83,16 @@ struct GlobalMemChecker : public StmtExprVisitor { PrimExpr index = indices[i]; PrimExpr shape_dim = buffer->shape[i]; - bool has_variable = false; + bool is_index_constant = true; PostOrderVisit(index, [&](const ObjectRef &obj) { if (const VarNode *v = obj.as()) { - has_variable = true; + is_index_constant = false; + } + if (const BufferLoadNode *v = obj.as()) { + is_index_constant = false; } }); - if (!has_variable) { + if (is_index_constant) { // If index is a constant, we can skip the check continue; } @@ -145,18 +122,31 @@ private: bool recursively_collect_conds_; }; -class SafeMemorysRewriter : public StmtExprMutator { - arith::Analyzer *analyzer_; - +class SafeMemorysRewriter : public IRMutatorWithAnalyzer { public: - explicit SafeMemorysRewriter(Map annotated_safe_value_map, - arith::Analyzer *analyzer) - : annotated_safe_value_map_(std::move(annotated_safe_value_map)), - analyzer_(analyzer) {} + // Static method to substitute and transform the given PrimFunc + static PrimFunc Substitute(PrimFunc f) { + arith::Analyzer analyzer; + // Create an instance of the legalizer with the analyzer + SafeMemorysRewriter substituter(&analyzer); + // Get a mutable copy of the function node + PrimFuncNode *fptr = f.CopyOnWrite(); + for (const auto &[_, buffer] : f->buffer_map) { + substituter.buffer_data_to_buffer_.Set(buffer->data, buffer); + } + // Apply the legalizer to the function body + fptr->body = substituter.VisitStmt(f->body); + return f; + } private: + // Constructor initializing the base class with the analyzer + SafeMemorysRewriter(arith::Analyzer *analyzer) + : arith::IRMutatorWithAnalyzer(analyzer) {} + // Constructor initializing the base class with the analyzer + PrimExpr VisitExpr_(const BufferLoadNode *op) final { - auto load = Downcast(StmtExprMutator::VisitExpr_(op)); + auto load = Downcast(IRMutatorWithAnalyzer::VisitExpr_(op)); // For Load/Store, we only check the current node, not its children. 
// Since rewriter will recursively visit children. @@ -181,7 +171,7 @@ private: Stmt VisitStmt_(const BufferStoreNode *op) final { // Check if the buffer is in global scope - auto store = Downcast(StmtExprMutator::VisitStmt_(op)); + auto store = Downcast(IRMutatorWithAnalyzer::VisitStmt_(op)); GlobalMemChecker checker(analyzer_, /*recursively_collect_conds=*/false); checker(store); @@ -253,6 +243,25 @@ private: return evaluate; } + Stmt VisitStmt_(const BlockNode *op) final { + for (auto buffer : op->alloc_buffers) { + buffer_data_to_buffer_.Set(buffer->data, buffer); + } + if (op->annotations.count(attr::kSafeValueMap)) { + auto map = op->annotations.Get(attr::kSafeValueMap) + ->as>() + .value(); + for (const auto &[var, safe_value] : map) { + ICHECK(buffer_data_to_buffer_.count(var)) + << "buffer " << var << " is not found in the block " + << buffer_data_to_buffer_; + auto buffer = buffer_data_to_buffer_[var]; + annotated_safe_value_map_.Set(buffer, safe_value); + } + } + return IRMutatorWithAnalyzer::VisitStmt_(op); + } + bool IsLocalBuffer(const Buffer &buffer) { String scope = buffer.scope(); return scope == "local" || scope == "local.fragment" || @@ -276,87 +285,6 @@ private: return make_zero(buffer->dtype); } - Map annotated_safe_value_map_; -}; - -// Class to legalize safe memory access by transforming them appropriately -class SafeMemoryLegalizer : IRMutatorWithAnalyzer { -public: - // Static method to substitute and transform the given PrimFunc - static PrimFunc Substitute(PrimFunc f) { - arith::Analyzer analyzer; - // Create an instance of the legalizer with the analyzer - SafeMemoryLegalizer substituter(&analyzer); - // Get a mutable copy of the function node - PrimFuncNode *fptr = f.CopyOnWrite(); - for (const auto &[_, buffer] : f->buffer_map) { - substituter.buffer_data_to_buffer_.Set(buffer->data, buffer); - } - // Apply the legalizer to the function body - fptr->body = substituter.VisitStmt(f->body); - return f; - } - -private: - // Constructor 
initializing the base class with the analyzer - SafeMemoryLegalizer(arith::Analyzer *analyzer) - : arith::IRMutatorWithAnalyzer(analyzer) {} - - // Override the VisitStmt_ method to handle ForNode (loop statements) - Stmt VisitStmt_(const ForNode *op) final { - // Visit and potentially modify the loop node - For for_node = Downcast(IRMutatorWithAnalyzer::VisitStmt_(op)); - auto has_inner_loop = HasInnerLoop(for_node->body); - if (!has_inner_loop) { - SafeMemorysRewriter rewriter(annotated_safe_value_map_, analyzer_); - for_node.CopyOnWrite()->body = rewriter(for_node->body); - // // Detect Buffer Load Node in the loop body, collect the indices and - // buffer size - - // // Run the checker on the loop body - // GlobalMemChecker checker(analyzer_); - // checker(for_node->body); - // Array conditions = checker.GetConditions(); - // auto body = for_node->body; - // // Note that we might have duplicate conditions - // // Which will be optimized by simplify pass - // // Replace the loop body with the new body - // for (auto cond : conditions) { - // body = IfThenElse(cond, body); - // } - // for_node.CopyOnWrite()->body = body; - return std::move(for_node); - } - - // Visit a For Node - return IRMutatorWithAnalyzer::VisitStmt_(op); - } - - Stmt VisitStmt_(const BlockNode *op) final { - for (auto buffer : op->alloc_buffers) { - buffer_data_to_buffer_.Set(buffer->data, buffer); - } - if (op->annotations.count(attr::kSafeValueMap)) { - auto map = op->annotations.Get(attr::kSafeValueMap) - ->as>() - .value(); - for (const auto &[var, safe_value] : map) { - ICHECK(buffer_data_to_buffer_.count(var)) - << "buffer " << var << " is not found in the block " - << buffer_data_to_buffer_; - auto buffer = buffer_data_to_buffer_[var]; - annotated_safe_value_map_.Set(buffer, safe_value); - } - } - return IRMutatorWithAnalyzer::VisitStmt_(op); - } - - static bool HasInnerLoop(const Stmt &stmt) { - LeafForFinder finder; - finder(stmt); - return !finder.leaf_for_nodes.empty(); - } - Map 
buffer_data_to_buffer_; Map annotated_safe_value_map_; }; @@ -371,7 +299,7 @@ tvm::transform::Pass LegalizeSafeMemoryAccess() { if (disable_safe_memory_legalize) { return f; } - return SafeMemoryLegalizer::Substitute(std::move(f)); + return SafeMemorysRewriter::Substitute(std::move(f)); }; // Create and return a PrimFunc pass with the transformation function return CreatePrimFuncPass(pass_func, 0, "tl.LegalizeSafeMemoryAccess", {}); diff --git a/src/transform/simplify.cc b/src/transform/simplify.cc index 5a83f0df..c10d5687 100644 --- a/src/transform/simplify.cc +++ b/src/transform/simplify.cc @@ -465,6 +465,16 @@ private: return std::move(store); } + Stmt VisitStmt_(const AttrStmtNode *op) override { + if (op->attr_key == "tl.assume") { + PrimExpr condition = this->VisitExpr(Downcast(op->node)); + auto n = CopyOnWrite(op); + n->node = std::move(condition); + return Parent::VisitStmt_(n.get()); + } + return Parent::VisitStmt_(op); + } + private: bool ArrayDeepEqual(const Array &lhs, const Array &rhs) { if (lhs.size() != rhs.size()) { -- GitLab From 721baedb7821c9be2950d45dad05a736a3590dfd Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Sat, 22 Nov 2025 19:24:45 +0800 Subject: [PATCH 033/139] [Bugfix] Fix autotune cache (#1315) --- tilelang/autotuner/param.py | 198 ++++++++++++++++++++++++++++-------- 1 file changed, 153 insertions(+), 45 deletions(-) diff --git a/tilelang/autotuner/param.py b/tilelang/autotuner/param.py index 3e401cc5..4c8d9a94 100644 --- a/tilelang/autotuner/param.py +++ b/tilelang/autotuner/param.py @@ -13,18 +13,25 @@ from pathlib import Path from tilelang.jit import JITKernel import cloudpickle import os -import shutil from tilelang.engine.param import KernelParam from tilelang import logger import json import hashlib +import uuid +from tilelang import env +from tvm.runtime import Executable BEST_CONFIG_PATH = "best_config.json" FUNCTION_PATH = "function.pkl" LATENCY_PATH = "latency.json" 
-KERNEL_PATH = "kernel.cu" -WRAPPED_KERNEL_PATH = "wrapped_kernel.cu" + +# Align file names with cache/kernel_cache.py +DEVICE_KERNEL_PATH = "device_kernel.cu" +HOST_KERNEL_PATH = "host_kernel.cu" +EXECUTABLE_PATH = "executable.so" KERNEL_LIB_PATH = "kernel_lib.so" +KERNEL_CUBIN_PATH = "kernel.cubin" +KERNEL_PY_PATH = "kernel.py" PARAMS_PATH = "params.pkl" @@ -143,6 +150,31 @@ class AutotuneResult: func: Callable | None = None kernel: Callable | None = None + @staticmethod + def _load_binary(path: str): + with open(path, "rb") as file: + binary = file.read() + return binary + + @staticmethod + def _safe_write_file(path: str, mode: str, operation: Callable[[Any], None]): + # Random a temporary file within the same FS as the cache directory + tmp_dir = env.TILELANG_TMP_DIR + os.makedirs(tmp_dir, exist_ok=True) + temp_path = os.path.join(tmp_dir, f"{os.getpid()}_{uuid.uuid4()}") + with open(temp_path, mode) as temp_file: + operation(temp_file) + # Use atomic POSIX replace, so other processes cannot see a partial write + os.replace(temp_path, path) + + @staticmethod + def _safe_write_executable(executable: Executable, path: str): + tmp_dir = env.TILELANG_TMP_DIR + os.makedirs(tmp_dir, exist_ok=True) + temp_path = os.path.join(tmp_dir, f"{os.getpid()}_{uuid.uuid4()}.so") + executable.export_library(temp_path) + os.replace(temp_path, path) + def _save_kernel_to_disk(self, cache_path: Path, kernel: JITKernel, verbose: bool = False): """ Persists a compiled kernel to disk cache. 
@@ -161,34 +193,68 @@ class AutotuneResult: """ os.makedirs(cache_path, exist_ok=True) # Ensure directory exists - # Save kernel source code + # Save device kernel source code try: - kernel_path = os.path.join(cache_path, KERNEL_PATH) + device_kernel_path = os.path.join(cache_path, DEVICE_KERNEL_PATH) if verbose: - logger.debug(f"Saving kernel source code to file: {kernel_path}") + logger.debug(f"Saving kernel source code to file: {device_kernel_path}") if kernel.kernel_source is not None: - with open(kernel_path, "w") as f: - f.write(kernel.kernel_source) + self._safe_write_file(device_kernel_path, "w", + lambda f: f.write(kernel.kernel_source)) except Exception as e: logger.error(f"Error saving kernel source code to disk: {e}") - # Save wrapped kernel source code + # Save host kernel source code (wrapped) try: - wrapped_kernel_path = os.path.join(cache_path, WRAPPED_KERNEL_PATH) + host_kernel_path = os.path.join(cache_path, HOST_KERNEL_PATH) if verbose: - logger.debug(f"Saving wrapped kernel source code to file: {wrapped_kernel_path}") - with open(wrapped_kernel_path, "w") as f: - f.write(kernel.get_kernel_source()) + logger.debug(f"Saving wrapped kernel source code to file: {host_kernel_path}") + # Match kernel_cache behavior: use host source for tvm_ffi, otherwise wrapped kernel + if kernel.execution_backend == "tvm_ffi": + self._safe_write_file(host_kernel_path, "w", + lambda f: f.write(kernel.adapter.get_host_source())) + else: + self._safe_write_file(host_kernel_path, "w", + lambda f: f.write(kernel.adapter.get_kernel_source())) except Exception as e: logger.error(f"Error saving wrapped kernel source code to disk: {e}") - # Save kernel library + # Save kernel library (backend-specific) try: - kernel_lib_path = os.path.join(cache_path, KERNEL_LIB_PATH) - src_lib_path = kernel.adapter.libpath - if verbose: - logger.debug(f"Saving kernel library to file: {kernel_lib_path}") - shutil.copy(src_lib_path, kernel_lib_path) + if kernel.execution_backend == "nvrtc": + 
kernel_lib_file = KERNEL_CUBIN_PATH + elif kernel.execution_backend == "tvm_ffi": + kernel_lib_file = EXECUTABLE_PATH + else: + kernel_lib_file = KERNEL_LIB_PATH + + kernel_lib_path = os.path.join(cache_path, kernel_lib_file) + + if kernel.execution_backend == "nvrtc": + # Save cubin and python helper file + src_lib_path = kernel.adapter.libpath + kernel_py_path = os.path.join(cache_path, KERNEL_PY_PATH) + py_src_path = src_lib_path.replace(".cubin", ".py") + if verbose: + logger.debug(f"Saving kernel nvrtc python code to file: {kernel_py_path}") + self._safe_write_file(kernel_py_path, "wb", + lambda f: f.write(self._load_binary(py_src_path))) + if verbose: + logger.debug(f"Saving kernel library to file: {kernel_lib_path}") + self._safe_write_file(kernel_lib_path, "wb", + lambda f: f.write(self._load_binary(src_lib_path))) + elif kernel.execution_backend == "tvm_ffi": + executable = kernel.adapter.executable + if verbose: + logger.debug(f"Saving kernel executable to file: {kernel_lib_path}") + self._safe_write_executable(executable, kernel_lib_path) + else: + src_lib_path = kernel.adapter.libpath + if verbose: + logger.debug(f"Saving kernel library to file: {kernel_lib_path}") + self._safe_write_file(kernel_lib_path, "wb", + lambda f: f.write(self._load_binary(src_lib_path))) + except Exception as e: logger.error(f"Error saving kernel library to disk: {e}") @@ -197,8 +263,7 @@ class AutotuneResult: params_path = os.path.join(cache_path, PARAMS_PATH) if verbose: logger.debug(f"Saving kernel parameters to disk: {params_path}") - with open(params_path, "wb") as f: - cloudpickle.dump(kernel.params, f) + self._safe_write_file(params_path, "wb", lambda f: cloudpickle.dump(kernel.params, f)) except Exception as e: logger.error(f"Error saving kernel parameters to disk: {e}") @@ -210,6 +275,7 @@ class AutotuneResult: out_idx: list[int] | int | None = None, execution_backend: Literal["tvm_ffi", "ctypes", "cython", "nvrtc", "torch"] = "tvm_ffi", pass_configs: dict = None, + 
compile_flags: list[str] | str | None = None, func: Callable = None, verbose: bool = False, ) -> JITKernel: @@ -233,23 +299,46 @@ class AutotuneResult: if not os.path.exists(cache_path): return None - kernel_global_source: str | None = None + # Resolve backend to pick correct file names + if execution_backend == "nvrtc": + kernel_lib_file = KERNEL_CUBIN_PATH + elif execution_backend == "tvm_ffi": + kernel_lib_file = EXECUTABLE_PATH + else: + kernel_lib_file = KERNEL_LIB_PATH + + device_kernel_path = os.path.join(cache_path, DEVICE_KERNEL_PATH) + host_kernel_path = os.path.join(cache_path, HOST_KERNEL_PATH) + kernel_lib_path = os.path.join(cache_path, kernel_lib_file) + params_path = os.path.join(cache_path, PARAMS_PATH) + + if not all([os.path.exists(file) for file in (kernel_lib_path, params_path)]): + return None + + device_kernel_source: str | None = None + host_kernel_source: str | None = None kernel_params: list[KernelParam] | None = None + # Load optional device kernel source try: - wrapped_kernel_path = os.path.join(cache_path, WRAPPED_KERNEL_PATH) if verbose: - logger.debug(f"Loading wrapped kernel source code from file: {wrapped_kernel_path}") - with open(wrapped_kernel_path) as f: - kernel_global_source = f.read() + logger.debug(f"Loading kernel source code from file: {device_kernel_path}") + with open(device_kernel_path) as f: + device_kernel_source = f.read() except Exception as e: - logger.error(f"Error loading wrapped kernel source code from disk: {e}") + logger.error(f"Error loading kernel source code from disk: {e}") - kernel_lib_path = os.path.join(cache_path, KERNEL_LIB_PATH) + # Load optional host kernel source + try: + if verbose: + logger.debug(f"Loading wrapped kernel source code from file: {host_kernel_path}") + with open(host_kernel_path) as f: + host_kernel_source = f.read() + except Exception as e: + logger.error(f"Error loading host kernel source code from disk: {e}") # Load kernel parameters try: - params_path = os.path.join(cache_path, 
PARAMS_PATH) if verbose: logger.debug(f"Loading kernel parameters from file: {params_path}") with open(params_path, "rb") as f: @@ -257,10 +346,11 @@ class AutotuneResult: except Exception as e: logger.error(f"Error loading kernel parameters from disk: {e}") - if kernel_global_source and kernel_params: + if host_kernel_source and device_kernel_source and kernel_params: return JITKernel.from_database( func=func, - kernel_global_source=kernel_global_source, + host_kernel_source=host_kernel_source, + device_kernel_source=device_kernel_source, kernel_lib_path=kernel_lib_path, params=kernel_params, target=target, @@ -268,6 +358,7 @@ class AutotuneResult: out_idx=out_idx, execution_backend=execution_backend, pass_configs=pass_configs, + compile_flags=compile_flags, ) else: return None @@ -276,26 +367,29 @@ class AutotuneResult: if not os.path.exists(path): os.makedirs(path) - # save best config + # save best config (atomic) if verbose: logger.debug(f"Saving best config to file: {path / BEST_CONFIG_PATH}") - with open(path / BEST_CONFIG_PATH, "w") as f: - json.dump(self.config, f) + self._safe_write_file( + str(path / BEST_CONFIG_PATH), "w", lambda f: json.dump(self.config, f)) - # save function + # save function (atomic) if verbose: logger.debug(f"Saving function to file: {path / FUNCTION_PATH}") - with open(path / FUNCTION_PATH, "wb") as f: - cloudpickle.dump(self.func, f) + self._safe_write_file( + str(path / FUNCTION_PATH), "wb", lambda f: cloudpickle.dump(self.func, f)) - # save ref latency + # save ref latency (atomic) if verbose: logger.debug(f"Saving latency to file: {path / LATENCY_PATH}") - with open(path / LATENCY_PATH, "w") as f: - json.dump({ + self._safe_write_file( + str(path / LATENCY_PATH), + "w", + lambda f: json.dump({ "latency": self.latency, "ref_latency": self.ref_latency, - }, f) + }, f), + ) # save kernel self._save_kernel_to_disk(path, self.kernel) @@ -306,6 +400,13 @@ class AutotuneResult: return None verbose = compile_args.verbose + # Normalize 
target and resolve execution backend for loading + from tilelang.utils.target import determine_target as _determine_target + from tilelang.jit.execution_backend import resolve_execution_backend + norm_target = Target(_determine_target(compile_args.target)) if isinstance( + compile_args.target, str) else compile_args.target + requested_backend = compile_args.execution_backend + resolved_backend = resolve_execution_backend(requested_backend, norm_target) # load best config if verbose: logger.debug(f"Loading best config from file: {path / BEST_CONFIG_PATH}") @@ -325,10 +426,17 @@ class AutotuneResult: latency = json.load(f) latency, ref_latency = latency["latency"], latency["ref_latency"] - kernel = cls._load_kernel_from_disk(cls, path, compile_args.target, - compile_args.target_host, compile_args.out_idx, - compile_args.execution_backend, - compile_args.pass_configs, func) + kernel = cls._load_kernel_from_disk( + cls, + path, + norm_target, + compile_args.target_host, + compile_args.out_idx, + resolved_backend, + compile_args.pass_configs, + None, # compile_flags not tracked here + func, + ) if kernel is None: return None kernel.update_tuner_result( -- GitLab From 9f7bac4c1c21d259c59f44114554256b39c3610b Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Sun, 23 Nov 2025 14:01:02 +0800 Subject: [PATCH 034/139] [Refactor] Backup Analyzer to get the appropriate arith informations (#1311) * [Refactor] Update Vectorization Functions to Accept Analyzer Parameter - Modified `VectorizeLoop` and related functions to accept an `arith::Analyzer` parameter, enhancing their capability to perform analysis during vectorization. - Updated multiple instances in `copy.cc`, `fill.cc`, `parallel.cc`, and layout inference files to utilize the new analyzer parameter for improved performance and correctness. 
- Ensured consistency across vectorization logic by integrating the analyzer into existing workflows, facilitating better optimization opportunities. * [Fix] Corrected PostOrderVisit call in loop_vectorize.cc - Updated the PostOrderVisit function to analyze the body of the loop node instead of the node itself, ensuring proper handling of nested loops during vectorization analysis. * fix * lint fix * fix --- 3rdparty/tvm | 2 +- src/op/copy.cc | 4 +- src/op/fill.cc | 6 +- src/op/parallel.cc | 3 +- src/transform/layout_inference.cc | 12 ++- src/transform/legalize_vectorized_loop.cc | 2 +- src/transform/loop_vectorize.cc | 99 +++++++++++++++-------- src/transform/loop_vectorize.h | 5 ++ 8 files changed, 87 insertions(+), 46 deletions(-) diff --git a/3rdparty/tvm b/3rdparty/tvm index bc31e7ad..cd2b2b60 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit bc31e7ad9f9fafd7659dfabafe359fd55a0ffc1e +Subproject commit cd2b2b6013d155b5822300b0a0740fa65320dd9e diff --git a/src/op/copy.cc b/src/op/copy.cc index 8ffef5ea..c2dd06fc 100644 --- a/src/op/copy.cc +++ b/src/op/copy.cc @@ -852,7 +852,7 @@ Stmt CopyNode::LowerNormalCopy(const LowerArgs &T, auto par_op = ParallelOp(transformed_loop); if (is_cpu_target) { - vectorized_thread_loop = VectorizeLoop(transformed_loop); + vectorized_thread_loop = VectorizeLoop(transformed_loop, analyzer); } else { std::vector levels = {InferLevel::kCommon, InferLevel::kStrict, InferLevel::kFree}; @@ -865,7 +865,7 @@ Stmt CopyNode::LowerNormalCopy(const LowerArgs &T, auto thread_var = T.thread_var; auto thread_loop = PartitionLoop(par_op->GetRoot(), T.thread_var, analyzer, loop_layout); - vectorized_thread_loop = VectorizeLoop(thread_loop); + vectorized_thread_loop = VectorizeLoop(thread_loop, analyzer); } if (par_op->GetPredicate(T.thread_var).defined()) { diff --git a/src/op/fill.cc b/src/op/fill.cc index 83b0842d..93b3bca0 100644 --- a/src/op/fill.cc +++ b/src/op/fill.cc @@ -207,7 +207,7 @@ Stmt FillNode::Lower(const 
LowerArgs &T, arith::Analyzer *analyzer) const { InferLevel::kFree); auto thread_loop = PartitionLoop(par_op->GetRoot(), T.thread_var, analyzer, par_op->GetLoopLayout()); - auto vectorized_thread_loop = VectorizeLoop(thread_loop); + auto vectorized_thread_loop = VectorizeLoop(thread_loop, analyzer); if (par_op->GetPredicate(T.thread_var).defined()) { return IfThenElse(par_op->GetPredicate(T.thread_var).value(), vectorized_thread_loop); @@ -215,7 +215,7 @@ Stmt FillNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { return vectorized_thread_loop; } else if (dst.scope() == "local") { auto init_loop = MakeSIMTLoop(analyzer); - auto vectorized_thread_loop = VectorizeLoop(init_loop); + auto vectorized_thread_loop = VectorizeLoop(init_loop, analyzer); return vectorized_thread_loop; } else if (dst.scope() == "shared.dyn" || dst.scope() == "shared" || dst.scope() == "global") { @@ -225,7 +225,7 @@ Stmt FillNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { InferLevel::kFree); auto thread_loop = PartitionLoop(par_op->GetRoot(), T.thread_var, analyzer, par_op->GetLoopLayout()); - auto vectorized_thread_loop = VectorizeLoop(thread_loop); + auto vectorized_thread_loop = VectorizeLoop(thread_loop, analyzer); if (par_op->GetPredicate(T.thread_var).defined()) { return IfThenElse(par_op->GetPredicate(T.thread_var).value(), vectorized_thread_loop); diff --git a/src/op/parallel.cc b/src/op/parallel.cc index 81777aa5..0d09cc12 100644 --- a/src/op/parallel.cc +++ b/src/op/parallel.cc @@ -452,8 +452,7 @@ LayoutMap ParallelOpNode::InferLayout(const LayoutInferArgs &T, // As the pass will do post processing to the layout auto maybe_remapped_root_ = IfBufferRemapLoopGenerator::run(root_, T.buffer_remap, T.layout_map); - int vector_size = GetVectorizeSize(maybe_remapped_root_); - + int vector_size = GetVectorizeSize(maybe_remapped_root_, T.analyzer); DLOG(INFO) << "[PlanLoopPartition] vector_size = " << vector_size << '\n'; PrimExpr loop_total_size = 1; diff 
--git a/src/transform/layout_inference.cc b/src/transform/layout_inference.cc index bd726b3d..be98b284 100644 --- a/src/transform/layout_inference.cc +++ b/src/transform/layout_inference.cc @@ -12,6 +12,7 @@ #include #include +#include #include #include "../layout/utils.h" @@ -85,6 +86,7 @@ public: auto &next = infer_list_[cur_infer_id]; auto iter_var = thread_var_vec_[cur_infer_id]; auto thread_bounds = thread_bounds_vec_[cur_infer_id]; + arith::Analyzer *cur_analyzer = analyzer_vec_[cur_infer_id].get(); auto buffer_oob = buffer_oob_vec_[cur_infer_id]; // Double-check that 'next' is valid ICHECK(next.defined()) << "infer_list_[" << cur_infer_id @@ -108,7 +110,7 @@ public: // Run InferLayout auto updates = next->InferLayout(LayoutInferArgs{target_, thread_bounds, layout_map, - &analyzer_, buffer_oob}, + cur_analyzer, buffer_oob}, level); // Process the returned updates for (const auto &[buffer, layout] : updates) { @@ -266,6 +268,9 @@ public: ICHECK_EQ(thread_bounds_vec_.size(), infer_list_.size()) << "Size mismatch: thread_bounds_vec_ and infer_list_ must match in " "length."; + ICHECK_EQ(analyzer_vec_.size(), infer_list_.size()) + << "Size mismatch: analyzer_vec_ and infer_list_ must match in " + "length."; ICHECK_EQ(buffer_oob_vec_.size(), infer_list_.size()) << "Size mismatch: buffer_oob_vec_ and infer_list_ must match in " "length."; @@ -452,6 +457,7 @@ private: } else { thread_bounds_vec_.push_back(Range::FromMinExtent(0, 1)); } + analyzer_vec_.push_back(analyzer_.Clone()); // Compute buffer oob for each buffer in the op if (const auto *copy = p.as()) { @@ -542,6 +548,7 @@ private: } else { thread_bounds_vec_.push_back(Range::FromMinExtent(0, 1)); } + analyzer_vec_.push_back(analyzer_.Clone()); buffer_oob_vec_.push_back(false); } else { IRVisitorWithAnalyzer::VisitStmt(op->body); @@ -683,6 +690,7 @@ private: IterVarType::kDataPar); std::vector thread_var_vec_; std::vector thread_bounds_vec_; + std::vector> analyzer_vec_; std::vector buffer_oob_vec_; Target 
target_; LayoutMap annotated_layout_map_; @@ -1024,7 +1032,7 @@ private: }); if ((has_non_local || has_cast_operations) && !has_reducer) { - for_node = VectorizeLoop(for_node); + for_node = VectorizeLoop(for_node, analyzer_); } if (result_.predicate_map.count(root) && parallel_loop) { diff --git a/src/transform/legalize_vectorized_loop.cc b/src/transform/legalize_vectorized_loop.cc index aa461784..4fd4ab91 100644 --- a/src/transform/legalize_vectorized_loop.cc +++ b/src/transform/legalize_vectorized_loop.cc @@ -73,7 +73,7 @@ private: // Change the loop kind from vectorized to serial for_node.CopyOnWrite()->kind = ForKind::kSerial; // Apply vectorization transformation to the loop - return VectorizeLoop(for_node); + return VectorizeLoop(for_node, analyzer_); } }; diff --git a/src/transform/loop_vectorize.cc b/src/transform/loop_vectorize.cc index 45283d90..e8a18b00 100644 --- a/src/transform/loop_vectorize.cc +++ b/src/transform/loop_vectorize.cc @@ -45,7 +45,7 @@ struct VectorizePlanResult { PrimExpr condition; }; -class VectorizeFindGlobalAccess : public arith::IRVisitorWithAnalyzer { +class VectorizeFindGlobalAccess : public StmtExprVisitor { public: VectorizeFindGlobalAccess() = default; @@ -60,19 +60,20 @@ private: void VisitStmt_(const BufferStoreNode *node) final { if (node->buffer.scope() == "global") has_global_access_ = true; - return arith::IRVisitorWithAnalyzer::VisitStmt_(node); + return StmtExprVisitor::VisitStmt_(node); } void VisitExpr_(const BufferLoadNode *node) final { if (node->buffer.scope() == "global") has_global_access_ = true; - return arith::IRVisitorWithAnalyzer::VisitExpr_(node); + return StmtExprVisitor::VisitExpr_(node); } }; -class VectorizePlanner : public arith::IRVisitorWithAnalyzer { +class VectorizePlanner : public arith::IRMutatorWithAnalyzer { public: - VectorizePlanner() = default; + explicit VectorizePlanner(arith::Analyzer *analyzer) + : arith::IRMutatorWithAnalyzer(analyzer) {} int Plan(const For &node) { 
tvm::transform::PassContext ctxt = tvm::transform::PassContext::Current(); @@ -92,21 +93,31 @@ public: } private: - void VisitStmt_(const ForNode *node) final { + Stmt VisitStmt_(const ForNode *node) final { inner_for_ = node; - auto extent_ptr = as_const_int(analyzer_.Simplify(node->extent)); - // Here I disable dynamic shape completely, - // In order to do it, the Planner should accept an analyzer with - // arithmetic info outside to prove the dividiblity of vector size - if (!extent_ptr) { - vector_size_ = 1; - return; + bool contains_nested_for = false; + // Must analysis vectorization on the innermost loop + PostOrderVisit(Downcast(node->body), [&](const ObjectRef &obj) { + if (obj.as()) { + contains_nested_for = true; + } + }); + + if (!contains_nested_for) { + auto extent_ptr = as_const_int(analyzer_->Simplify(node->extent)); + // Here I disable dynamic shape completely, + // In order to do it, the Planner should accept an analyzer with + // arithmetic info outside to prove the dividiblity of vector size + if (!extent_ptr) { + vector_size_ = 1; + return ffi::GetRef(node); + } + vector_size_ = arith::ZeroAwareGCD(vector_size_, *extent_ptr); } - vector_size_ = arith::ZeroAwareGCD(vector_size_, *extent_ptr); - arith::IRVisitorWithAnalyzer::VisitStmt_(node); + return arith::IRMutatorWithAnalyzer::VisitStmt_(node); } - void VisitExpr_(const BufferLoadNode *node) final { + PrimExpr VisitExpr_(const BufferLoadNode *node) final { if (node->buffer.scope() == "shared" || node->buffer.scope() == "global" || node->buffer.scope() == "shared.dyn") has_nonlocal_memory_access_ = true; @@ -115,43 +126,44 @@ private: // constant buffer that tl hack to use as local register. 
auto boundary_check = node->buffer->shape[0].as(); if (boundary_check && boundary_check->value == 1) { - return arith::IRVisitorWithAnalyzer::VisitExpr_(node); + return arith::IRMutatorWithAnalyzer::VisitExpr_(node); } } UpdateVectorSize(node->indices, node->buffer); + return arith::IRMutatorWithAnalyzer::VisitExpr_(node); } - void VisitStmt_(const BufferStoreNode *node) final { + Stmt VisitStmt_(const BufferStoreNode *node) final { if (node->buffer.scope() == "shared" || node->buffer.scope() == "global" || node->buffer.scope() == "shared.dyn") has_nonlocal_memory_access_ = true; UpdateVectorSize(node->indices, node->buffer); - return arith::IRVisitorWithAnalyzer::VisitExpr(node->value); + return arith::IRMutatorWithAnalyzer::VisitStmt_(node); } - void VisitStmt_(const IfThenElseNode *node) final { + Stmt VisitStmt_(const IfThenElseNode *node) final { CheckConditionVectorized(node->condition); - return arith::IRVisitorWithAnalyzer::VisitStmt_(node); + return arith::IRMutatorWithAnalyzer::VisitStmt_(node); } - void VisitExpr_(const CallNode *node) final { + PrimExpr VisitExpr_(const CallNode *node) final { if (node->op == builtin::if_then_else()) { CheckConditionVectorized(node->args[0]); } else if (node->op == builtin::call_extern()) { // do not vectorize extern calls vector_size_ = 1; } - return arith::IRVisitorWithAnalyzer::VisitExpr_(node); + return arith::IRMutatorWithAnalyzer::VisitExpr_(node); } void CheckConditionVectorized(const PrimExpr &cond) { // TODO: perform some checks here } - void VisitExpr_(const CastNode *node) final { + PrimExpr VisitExpr_(const CastNode *node) final { vector_size_ = arith::ZeroAwareGCD( vector_load_bits_max_ / node->dtype.bits(), vector_size_); - return arith::IRVisitorWithAnalyzer::VisitExpr_(node); + return arith::IRMutatorWithAnalyzer::VisitExpr_(node); } void UpdateVectorSize(const Array indices, const Buffer &buffer) { @@ -171,19 +183,16 @@ private: for (int i = 0; i < indices.size(); ++i) { elem_offset += indices[i] * 
strides[i]; } - // 2. If element offset is independent with loop_var, ignore it - if (CanProveIndependent(elem_offset, inner_for_->loop_var, &analyzer_)) { + if (CanProveIndependent(elem_offset, inner_for_->loop_var, analyzer_)) { return; } - // 3. Tight vectorize bound vector_size_ = arith::ZeroAwareGCD(vector_size_, vector_load_bits_max_ / buffer->dtype.bits()); - // 4. Try to vectorize buffer load while (!IndiceCanVectorize(elem_offset, inner_for_->loop_var, - inner_for_->extent, vector_size_, &analyzer_)) { + inner_for_->extent, vector_size_, analyzer_)) { vector_size_ /= 2; } } @@ -235,7 +244,14 @@ private: const int vector_size_; }; -int GetVectorizeSize(const For &loop) { return VectorizePlanner().Plan(loop); } +int GetVectorizeSize(const For &loop) { + arith::Analyzer analyzer; + return VectorizePlanner(&analyzer).Plan(loop); +} + +int GetVectorizeSize(const For &loop, arith::Analyzer *analyzer) { + return VectorizePlanner(analyzer).Plan(loop); +} bool CanProveIndependent(const PrimExpr &expr, Var var, arith::Analyzer *analyzer) { @@ -274,10 +290,10 @@ bool IndiceCanVectorize(const PrimExpr &expr, Var var, if (!analyzer->CanProveEqual(FloorMod(iter_var_size, target_size_for_iter), 0)) return false; - + auto simplified_expr = analyzer->Simplify(Substitute(expr, {{var, zero}})); // The base offset must be divisible - if (!analyzer->CanProveEqual( - FloorMod(Substitute(expr, {{var, zero}}), target_size_for_expr), 0)) { + if (!analyzer->CanProveEqual(FloorMod(simplified_expr, target_size_for_expr), + zero)) { return false; } @@ -308,7 +324,20 @@ bool IndiceCanVectorize(const PrimExpr &expr, Var var, For VectorizeLoop(const For &loop, int vectorize_hint) { if (vectorize_hint <= 0) { - VectorizePlanner planner; + arith::Analyzer analyzer; + VectorizePlanner planner(&analyzer); + vectorize_hint = planner.Plan(loop); + } + if (vectorize_hint == 1) + return loop; + auto rewriter = VectorizeRewriter(vectorize_hint); + return Downcast(rewriter(loop)); +} + +For 
VectorizeLoop(const For &loop, arith::Analyzer *analyzer, + int vectorize_hint) { + if (vectorize_hint <= 0) { + VectorizePlanner planner(analyzer); vectorize_hint = planner.Plan(loop); } if (vectorize_hint == 1) diff --git a/src/transform/loop_vectorize.h b/src/transform/loop_vectorize.h index 4ab20c66..a63c4b45 100644 --- a/src/transform/loop_vectorize.h +++ b/src/transform/loop_vectorize.h @@ -35,8 +35,13 @@ using namespace tir; int GetVectorizeSize(const For &loop); +int GetVectorizeSize(const For &loop, arith::Analyzer *analyzer); + For VectorizeLoop(const For &loop, int vectorize_hint = -1); +For VectorizeLoop(const For &loop, arith::Analyzer *analyzer, + int vectorize_hint = -1); + // Can prove expr is independent with var, i.e. the value of expr doesn't change // when var changes bool CanProveIndependent(const PrimExpr &expr, Var var, -- GitLab From ca98cc391790d160cffcb0b997c2380c276b8e2e Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Mon, 24 Nov 2025 16:17:13 +0800 Subject: [PATCH 035/139] Revert "[WIP] support more dtypes for tcgen05 (#1229)" (#1323) This reverts commit 0d101c110f74ebf2ef8c11a5ece9dfb314b48baa. 
Co-authored-by: Zhiwen Mo --- .../example_tilelang_gemm_fp8_sm100.py | 126 --- src/op/copy.cc | 14 +- src/op/gemm_py.cc | 2 - src/op/tcgen5_meta.h | 38 +- src/tl_templates/cuda/copy_sm100.h | 35 +- src/tl_templates/cuda/gemm_sm100.h | 76 +- src/tl_templates/cuda/tcgen_05_ld.h | 753 +----------------- tilelang/intrinsics/mma_macro_generator.py | 3 - .../intrinsics/tcgen05_macro_generator.py | 9 +- tilelang/jit/adapter/wrapper.py | 1 - tilelang/tileop/gemm/gemm_tcgen05.py | 5 +- 11 files changed, 87 insertions(+), 975 deletions(-) delete mode 100644 examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py deleted file mode 100644 index 4628a997..00000000 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import tilelang -import tilelang.language as T -from tilelang.utils.tensor import map_torch_type - - -def matmul( - M, - N, - K, - block_M, - block_N, - block_K, - trans_A, - trans_B, - in_dtype, - out_dtype, - accum_dtype, - num_stages, - threads, -): - A_shape = (K, M) if trans_A else (M, K) - B_shape = (N, K) if trans_B else (K, N) - A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) - B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) - - @T.prim_func - def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), - ): - with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype) - B_shared = T.alloc_shared(B_shared_shape, in_dtype) - C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) - mbar = T.alloc_barrier(1) - C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - C_shared = T.alloc_shared((block_M, block_N), out_dtype) - - for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - T.copy(A[by * 
block_M, k * block_K], A_shared) - T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm_v2( - A_shared, - B_shared, - C_tmem, - trans_A, - trans_B, - mbar=mbar, - wg_wait=-1, - clear_accum=(k == 0), - ) - T.mbarrier_wait_parity(mbar, k % 2) - - T.copy(C_tmem, C_local) - T.copy(C_local, C_shared) - - T.copy(C_shared, C[by * block_M, bx * block_N]) - - return main - - -def calc_diff(x, y): - x, y = x.double(), y.double() - denominator = (x * x + y * y).sum() - sim = 2 * (x * y).sum() / denominator - return 1 - sim - - -M, N, K = 4096, 4096, 8192 -block_M, block_N, block_K = 64, 256, 32 -trans_A, trans_B = False, True -num_stages = 2 -threads = 256 -for tvm_fp8_dtype in ["float8_e4m3", "float8_e5m2"]: - for tvm_acc_dtype in ["float16", "float32"]: # , torch.float16]: - torch_fp8_dtype = map_torch_type(tvm_fp8_dtype) - torch_acc_dtype = map_torch_type(tvm_acc_dtype) - print(f"running {tvm_fp8_dtype} -> {tvm_acc_dtype}") - in_dtype, out_dtype, accum_dtype = tvm_fp8_dtype, tvm_acc_dtype, tvm_acc_dtype - - func = matmul( - M, - N, - K, - block_M, - block_N, - block_K, - trans_A, - trans_B, - in_dtype, - out_dtype, - accum_dtype, - num_stages, - threads, - ) - jit_kernel = tilelang.compile( - func, - out_idx=[2], - target="cuda", - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - tilelang.PassConfigKey.TL_ENABLE_PTXAS_VERBOSE_OUTPUT: True, - }, - ) - # jit_kernel.export_ptx("./dump.ptx") - # jit_kernel.export_sources("./dump.cu") - - a = torch.randn(M, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) - b = torch.randn(N, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) - - c = jit_kernel(a, b) - ref_c = (a.to(torch.half) @ b.T.to(torch.half)).float() - c = c.float() - diff = calc_diff(c, ref_c) - # assert diff < 1e-3, f"{diff}" - print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] diff = {diff}") - - profiler = jit_kernel.get_profiler() - latency = profiler.do_bench() - 
print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Latency: {latency} ms") - print( - f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Flops: {2 * M * N * K / (latency / 1e3) / 1e12} TFLOPS" - ) diff --git a/src/op/copy.cc b/src/op/copy.cc index c2dd06fc..2584abce 100644 --- a/src/op/copy.cc +++ b/src/op/copy.cc @@ -1117,11 +1117,6 @@ Stmt CopyNode::LowerTmemCopy(const LowerArgs &T, bool is_ld = false; // tcgen05.ld (tensor memory -> register) bool is_st = false; // tcgen05.st (register -> tensor memory) bool is_cp = false; // tcgen05.cp (shared memory -> tensor memory) - bool src_needs_pack = - 16 == src->dtype.bits(); // if needs .pack::16b when is_ld - bool dst_needs_unpack = - 16 == dst->dtype.bits(); // if needs .unpack::16b when is_st - if (src.scope() == "shared.tmem" && dst.scope() == "local.fragment") { is_ld = true; } else if (src.scope() == "local.fragment" && dst.scope() == "shared.tmem") { @@ -1129,8 +1124,9 @@ Stmt CopyNode::LowerTmemCopy(const LowerArgs &T, } else if (src.scope() == "shared.dyn" && dst.scope() == "shared.tmem") { is_cp = true; } else { - ICHECK(0) << "Unsupported tensor memory copy: " << "src scope = " - << src.scope() << ", dst scope = " << dst.scope(); + ICHECK(0) << "Unsupported tensor memory copy: " + << "src scope = " << src.scope() + << ", dst scope = " << dst.scope(); } // Currently tcgen05.cp is not supported // TODO (mzw) Support tcgen05.cp @@ -1250,10 +1246,8 @@ Stmt CopyNode::LowerTmemCopy(const LowerArgs &T, : relative_wg_idx * (num_chunks_each_wg * meta.width); have_succeeded = true; Array args; - const char *bool_str = src_needs_pack ? 
"true" : "false"; args.push_back(StringImm(meta.intrinsics_name + "<" + - std::to_string(num_chunks_each_wg) + ", " + - bool_str + ">")); + std::to_string(num_chunks_each_wg) + ">")); args.push_back( BufferLoad(src, {(int)logical_row_min, (int)logical_col_min})); // Will be translated later diff --git a/src/op/gemm_py.cc b/src/op/gemm_py.cc index 6097998c..ac506ee0 100644 --- a/src/op/gemm_py.cc +++ b/src/op/gemm_py.cc @@ -428,8 +428,6 @@ TVM_FFI_STATIC_INIT_BLOCK() { result.push_back(Integer(meta.atom_m)); result.push_back(Integer(meta.atom_n)); result.push_back(Integer(meta.atom_k)); - result.push_back(Integer(meta.enable_ws)); - result.push_back(Integer(meta.enable_2cta)); } return result; }); diff --git a/src/op/tcgen5_meta.h b/src/op/tcgen5_meta.h index 350a2bc8..bb63c8dc 100644 --- a/src/op/tcgen5_meta.h +++ b/src/op/tcgen5_meta.h @@ -15,19 +15,16 @@ using runtime::DataType; struct TCGEN5MMAMeta { int atom_m, atom_n, atom_k; - bool enable_ws, enable_2cta; }; inline std::pair GetTCGEN5MMAMeta(int M, int N, int K, DataType ab_dtype, DataType c_dtype) { // TODO (lei) Currently not all shapes / dtypes are supported for TCGEN5MMA. 
#define FAIL \ + return { false, TCGEN5MMAMeta{0, 0, 0} } +#define SUCCESS(atom_m, atom_n, atom_k) \ return { \ - false, TCGEN5MMAMeta { 0, 0, 0, false, false } \ - } -#define SUCCESS(atom_m, atom_n, atom_k, use_ws, use_2cta) \ - return { \ - true, TCGEN5MMAMeta { atom_m, atom_n, atom_k, use_ws, use_2cta } \ + true, TCGEN5MMAMeta { atom_m, atom_n, atom_k } \ } std::vector ws_valid_atom_ns = {256, 128, 64}; if ((ab_dtype.is_bfloat16() || ab_dtype.is_float16()) && @@ -37,52 +34,39 @@ GetTCGEN5MMAMeta(int M, int N, int K, DataType ab_dtype, DataType c_dtype) { if (M % 128 == 0) { for (int atom_n = 256; atom_n >= 16; atom_n -= 16) if (N % atom_n == 0) - SUCCESS(128, atom_n, 16, false, false); + SUCCESS(128, atom_n, 16); FAIL; } else if (M % 64 == 0) { for (int atom_n : ws_valid_atom_ns) if (N % atom_n == 0) - SUCCESS(64, atom_n, 16, false, false); + SUCCESS(64, atom_n, 16); FAIL; } else if (M % 32 == 0) { for (int atom_n : ws_valid_atom_ns) if (N % atom_n == 0) - SUCCESS(32, atom_n, 16, false, false); + SUCCESS(32, atom_n, 16); FAIL; } else { FAIL; } - } else if ((ab_dtype.is_float8_e4m3fn() || ab_dtype.is_float8_e4m3() || - ab_dtype.is_float8_e5m2() || ab_dtype.is_float8_e5m2fnuz() || - ab_dtype.is_float6_e2m3fn() || ab_dtype.is_float6_e3m2fn() || - ab_dtype.is_float4_e2m1fn()) && - ((c_dtype.is_float() && c_dtype.bits() == 32) || - (c_dtype.is_float16() && c_dtype.bits() == 16))) { + } else if ((ab_dtype.is_float8_e4m3fn() || ab_dtype.is_float8_e5m2()) && + (c_dtype.is_float() && c_dtype.bits() == 32)) { if (K % 32 != 0) FAIL; if (M % 128 == 0) { - for (int atom_n : ws_valid_atom_ns) - if (N % atom_n == 0) - SUCCESS(128, atom_n, 32, true, false); for (int atom_n = 256; atom_n >= 16; atom_n -= 16) if (N % atom_n == 0) - SUCCESS(128, atom_n, 32, false, true); - for (int atom_n = 256; atom_n >= 8; atom_n -= 8) - if (N % atom_n == 0) - SUCCESS(128, atom_n, 32, false, false); + SUCCESS(128, atom_n, 32); FAIL; } else if (M % 64 == 0) { for (int atom_n : ws_valid_atom_ns) 
if (N % atom_n == 0) - SUCCESS(64, atom_n, 32, true, false); - for (int atom_n = 256; atom_n >= 8; atom_n -= 8) - if (N % atom_n == 0) - SUCCESS(128, atom_n, 32, false, false); + SUCCESS(64, atom_n, 32); FAIL; } else if (M % 32 == 0) { for (int atom_n : ws_valid_atom_ns) if (N % atom_n == 0) - SUCCESS(32, atom_n, 32, true, false); + SUCCESS(32, atom_n, 32); FAIL; } else { FAIL; diff --git a/src/tl_templates/cuda/copy_sm100.h b/src/tl_templates/cuda/copy_sm100.h index aa898bcc..c4047c34 100644 --- a/src/tl_templates/cuda/copy_sm100.h +++ b/src/tl_templates/cuda/copy_sm100.h @@ -51,21 +51,6 @@ __device__ __forceinline__ void st_global_256(fp8_e4_32_t *ptr, : : "l"(ptr), "l"(val.x), "l"(val.y), "l"(val.z), "l"(val.w)); } -__device__ __forceinline__ ulonglong4 ld_global_256(const fp8_e5_32_t *ptr) { - ulonglong4 ret; - asm volatile("ld.global.v4.u64 {%0, %1, %2, %3}, [%4];" - : "=l"(ret.x), "=l"(ret.y), "=l"(ret.z), "=l"(ret.w) - : "l"(ptr)); - return ret; -} - -__device__ __forceinline__ void st_global_256(fp8_e5_32_t *ptr, - fp8_e5_32_t &val8) { - ulonglong4 &val = *((ulonglong4 *)&val8); - asm volatile("st.global.v4.u64 [%0], {%1, %2, %3, %4};" - : - : "l"(ptr), "l"(val.x), "l"(val.y), "l"(val.z), "l"(val.w)); -} __device__ __forceinline__ unsigned long long pack_bfloat16x4(const bfloat16_t x, const bfloat16_t y, const bfloat16_t z, @@ -110,38 +95,38 @@ __device__ __forceinline__ void tcgen05_ld_core(uint32_t const &tmem_start_col, } } -template +template __device__ __forceinline__ void tcgen05_ld_32dp32bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core, 7, N>( - tmem_start_col + tmem_col_offset, dst_ptr); + tcgen05_ld_core(tmem_start_col + tmem_col_offset, + dst_ptr); tl::fence_view_async_tmem_load(); } -template +template __device__ __forceinline__ void tcgen05_ld_32dp64bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core, 7, N>( - tmem_start_col + 
tmem_col_offset, dst_ptr); + tcgen05_ld_core(tmem_start_col + tmem_col_offset, + dst_ptr); tl::fence_view_async_tmem_load(); } -template +template __device__ __forceinline__ void tcgen05_ld_32dp128bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core, 6, N>( + tcgen05_ld_core( tmem_start_col + tmem_col_offset, dst_ptr); tl::fence_view_async_tmem_load(); } -template +template __device__ __forceinline__ void tcgen05_ld_32dp256bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core, 5, N>( + tcgen05_ld_core( tmem_start_col + tmem_col_offset, dst_ptr); tl::fence_view_async_tmem_load(); } diff --git a/src/tl_templates/cuda/gemm_sm100.h b/src/tl_templates/cuda/gemm_sm100.h index 6c68c2c2..856d37dd 100644 --- a/src/tl_templates/cuda/gemm_sm100.h +++ b/src/tl_templates/cuda/gemm_sm100.h @@ -243,96 +243,46 @@ struct DispatchInstruction -struct DispatchInstruction> { - using MMA = - MMA_Traits, Int, integral_constant, - integral_constant, - integral_constant, - integral_constant>; -}; -template -struct DispatchInstruction> { - using MMA = MMA_Traits, Int, - integral_constant, + using MMA = MMA_Traits, + Int, integral_constant, integral_constant, integral_constant, integral_constant>; }; template -struct DispatchInstruction> { +struct DispatchInstruction> { using MMA = - MMA_Traits, Int, integral_constant, + MMA_Traits, + Int, integral_constant, integral_constant, integral_constant, integral_constant>; }; -template -struct DispatchInstruction> { - using MMA = MMA_Traits, Int, - integral_constant, - integral_constant, - integral_constant, - integral_constant>; -}; template -struct DispatchInstruction> { - using MMA = - MMA_Traits, Int, integral_constant, - integral_constant, - integral_constant, - integral_constant>; -}; -template -struct DispatchInstruction> { - using MMA = MMA_Traits, Int, - integral_constant, + using MMA = MMA_Traits, + Int, integral_constant, 
integral_constant, integral_constant, integral_constant>; }; template -struct DispatchInstruction> { using MMA = - MMA_Traits, Int, integral_constant, + MMA_Traits, + Int, integral_constant, integral_constant, integral_constant, integral_constant>; }; -template -struct DispatchInstruction> { - using MMA = MMA_Traits, Int, - integral_constant, - integral_constant, - integral_constant, - integral_constant>; -}; template class tmem_ld_32dp32bNx; - -template <> class tmem_ld_32dp32bNx { +class tmem_ld_32dp32bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { @@ -182,180 +180,9 @@ public: } } }; -template <> class tmem_ld_32dp32bNx { -public: - template - static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 128, - "N must be a power of 2 and lies between 1 ~ 128"); - - if constexpr (N == 1) { - asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x1.b32" - "{%0}," - "[%1];\n" - : "=r"(dst_ptr[0]) - : "r"(src_addr)); - } else if constexpr (N == 2) { - asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x2.b32" - "{%0, %1}," - "[%2];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) - : "r"(src_addr)); - } else if constexpr (N == 4) { - asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x4.b32" - "{%0, %1, %2, %3}," - "[%4];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]) - : "r"(src_addr)); - } else if constexpr (N == 8) { - asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x8.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7}," - "[%8];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) - : "r"(src_addr)); - } else if constexpr (N == 16) { - asm volatile( - "tcgen05.ld.sync.aligned.32x32b.pack::16b.x16.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " - "%14, %15}," - "[%16];\n" - : "=r"(dst_ptr[0]), 
"=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]) - : "r"(src_addr)); - } else if constexpr (N == 32) { - asm volatile( - "tcgen05.ld.sync.aligned.32x32b.pack::16b.x32.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " - "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " - "%26, %27, %28, %29, %30, %31}," - "[%32];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) - : "r"(src_addr)); - } else if constexpr (N == 64) { - asm volatile( - "tcgen05.ld.sync.aligned.32x32b.pack::16b.x64.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " - "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " - "%28, " - "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " - "%42, " - "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " - "%56, " - "%57, %58, %59, %60, %61, %62, %63}," - "[%64];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - 
"=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), - "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), - "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), - "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), - "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), - "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), - "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), - "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), - "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), - "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), - "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), - "=r"(dst_ptr[63]) - : "r"(src_addr)); - } else if constexpr (N == 128) { - asm volatile( - "tcgen05.ld.sync.aligned.32x32b.pack::16b.x128.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " - "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " - "%28, " - "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " - "%42, " - "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " - "%56, " - "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " - "%70, " - "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " - "%84, " - "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " - "%98, " - "%99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, " - "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " - "%121, %122, %123, %124, %125, %126, %127}," - "[%128];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), 
"=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), - "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), - "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), - "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), - "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), - "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), - "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), - "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), - "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), - "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), - "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), - "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), - "=r"(dst_ptr[66]), "=r"(dst_ptr[67]), "=r"(dst_ptr[68]), - "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), - "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), - "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), - "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), - "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), - "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), - "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), - "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), - "=r"(dst_ptr[93]), "=r"(dst_ptr[94]), "=r"(dst_ptr[95]), - "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), - "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), - "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), - "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), - "=r"(dst_ptr[108]), 
"=r"(dst_ptr[109]), "=r"(dst_ptr[110]), - "=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), - "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), - "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), - "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), - "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), "=r"(dst_ptr[125]), - "=r"(dst_ptr[126]), "=r"(dst_ptr[127]) - : "r"(src_addr)); - } else { - asm volatile("trap"); - } - } -}; // 16 data path lanes, 64-bit pattern, repeated N times -template class tmem_ld_16dp64bNx; -template <> class tmem_ld_16dp64bNx { +class tmem_ld_16dp64bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { @@ -525,43 +352,39 @@ public: } } }; -template <> class tmem_ld_16dp64bNx { + +// 16 data path lanes, 128-bit pattern, repeated N times +class tmem_ld_16dp128bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 128, - "N must be a power of 2 and lies between 1 ~ 128"); + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 64, + "N must be a power of 2 and lies between 1 ~ 64"); if constexpr (N == 1) { - asm volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x1.b32" - "{%0}," - "[%1];\n" - : "=r"(dst_ptr[0]) - : "r"(src_addr)); - } else if constexpr (N == 2) { - asm volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x2.b32" + asm volatile("tcgen05.ld.sync.aligned.16x128b.x1.b32" "{%0, %1}," "[%2];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) : "r"(src_addr)); - } else if constexpr (N == 4) { - asm volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x4.b32" + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.x2.b32" "{%0, %1, %2, %3}," "[%4];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]) : "r"(src_addr)); - } else if constexpr (N == 8) { - asm volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x8.b32" + } else if constexpr (N == 
4) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.x4.b32" "{%0, %1, %2, %3, %4, %5, %6, %7}," "[%8];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) : "r"(src_addr)); - } else if constexpr (N == 16) { + } else if constexpr (N == 8) { asm volatile( - "tcgen05.ld.sync.aligned.16x64b.pack::16b.x16.b32" + "tcgen05.ld.sync.aligned.16x128b.x8.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15}," "[%16];\n" @@ -572,9 +395,9 @@ public: "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), "=r"(dst_ptr[15]) : "r"(src_addr)); - } else if constexpr (N == 32) { + } else if constexpr (N == 16) { asm volatile( - "tcgen05.ld.sync.aligned.16x64b.pack::16b.x32.b32" + "tcgen05.ld.sync.aligned.16x128b.x16.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " "%26, %27, %28, %29, %30, %31}," @@ -591,9 +414,9 @@ public: "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) : "r"(src_addr)); - } else if constexpr (N == 64) { + } else if constexpr (N == 32) { asm volatile( - "tcgen05.ld.sync.aligned.16x64b.pack::16b.x64.b32" + "tcgen05.ld.sync.aligned.16x128b.x32.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " "%28, " @@ -626,9 +449,9 @@ public: "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), "=r"(dst_ptr[63]) : "r"(src_addr)); - } else if constexpr (N == 128) { + } else if constexpr (N == 64) { asm volatile( - "tcgen05.ld.sync.aligned.16x64b.pack::16b.x128.b32" + "tcgen05.ld.sync.aligned.16x128b.x64.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " "%28, " @@ -696,39 +519,32 @@ public: } }; -// 16 data path lanes, 128-bit pattern, repeated N times 
-template class tmem_ld_16dp128bNx; -template <> class tmem_ld_16dp128bNx { +// 16 data path lanes, 256-bit pattern, repeated N times +class tmem_ld_16dp256bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 64, - "N must be a power of 2 and lies between 1 ~ 64"); + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 32, + "N must be a power of 2 and lies between 1 ~ 32"); if constexpr (N == 1) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.x1.b32" - "{%0, %1}," - "[%2];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) - : "r"(src_addr)); - } else if constexpr (N == 2) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.x2.b32" + asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32" "{%0, %1, %2, %3}," "[%4];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]) : "r"(src_addr)); - } else if constexpr (N == 4) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.x4.b32" + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x256b.x2.b32" "{%0, %1, %2, %3, %4, %5, %6, %7}," "[%8];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) : "r"(src_addr)); - } else if constexpr (N == 8) { + } else if constexpr (N == 4) { asm volatile( - "tcgen05.ld.sync.aligned.16x128b.x8.b32" + "tcgen05.ld.sync.aligned.16x256b.x4.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15}," "[%16];\n" @@ -739,9 +555,9 @@ public: "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), "=r"(dst_ptr[15]) : "r"(src_addr)); - } else if constexpr (N == 16) { + } else if constexpr (N == 8) { asm volatile( - "tcgen05.ld.sync.aligned.16x128b.x16.b32" + "tcgen05.ld.sync.aligned.16x256b.x8.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " "%26, %27, %28, %29, %30, %31}," @@ 
-758,9 +574,9 @@ public: "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) : "r"(src_addr)); - } else if constexpr (N == 32) { + } else if constexpr (N == 16) { asm volatile( - "tcgen05.ld.sync.aligned.16x128b.x32.b32" + "tcgen05.ld.sync.aligned.16x256b.x16.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " "%28, " @@ -793,332 +609,7 @@ public: "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), "=r"(dst_ptr[63]) : "r"(src_addr)); - } else if constexpr (N == 64) { - asm volatile( - "tcgen05.ld.sync.aligned.16x128b.x64.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " - "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " - "%28, " - "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " - "%42, " - "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " - "%56, " - "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " - "%70, " - "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " - "%84, " - "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " - "%98, " - "%99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, " - "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " - "%121, %122, %123, %124, %125, %126, %127}," - "[%128];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), 
"=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), - "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), - "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), - "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), - "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), - "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), - "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), - "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), - "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), - "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), - "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), - "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), - "=r"(dst_ptr[66]), "=r"(dst_ptr[67]), "=r"(dst_ptr[68]), - "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), - "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), - "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), - "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), - "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), - "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), - "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), - "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), - "=r"(dst_ptr[93]), "=r"(dst_ptr[94]), "=r"(dst_ptr[95]), - "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), - "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), - "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), - "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), - "=r"(dst_ptr[108]), "=r"(dst_ptr[109]), "=r"(dst_ptr[110]), - "=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), - "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), - "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), - "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), - "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), "=r"(dst_ptr[125]), - "=r"(dst_ptr[126]), "=r"(dst_ptr[127]) - : "r"(src_addr)); - } 
else { - asm volatile("trap"); - } - } -}; -template <> class tmem_ld_16dp128bNx { -public: - template - static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 64, - "N must be a power of 2 and lies between 1 ~ 64"); - - if constexpr (N == 1) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.pack::16b.x1.b32" - "{%0, %1}," - "[%2];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) - : "r"(src_addr)); - } else if constexpr (N == 2) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.pack::16b.x2.b32" - "{%0, %1, %2, %3}," - "[%4];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]) - : "r"(src_addr)); - } else if constexpr (N == 4) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.pack::16b.x4.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7}," - "[%8];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) - : "r"(src_addr)); - } else if constexpr (N == 8) { - asm volatile( - "tcgen05.ld.sync.aligned.16x128b.pack::16b.x8.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " - "%14, %15}," - "[%16];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]) - : "r"(src_addr)); - } else if constexpr (N == 16) { - asm volatile( - "tcgen05.ld.sync.aligned.16x128b.pack::16b.x16.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " - "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " - "%26, %27, %28, %29, %30, %31}," - "[%32];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - 
"=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) - : "r"(src_addr)); - } else if constexpr (N == 32) { - asm volatile( - "tcgen05.ld.sync.aligned.16x128b.pack::16b.x32.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " - "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " - "%28, " - "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " - "%42, " - "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " - "%56, " - "%57, %58, %59, %60, %61, %62, %63}," - "[%64];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), - "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), - "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), - "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), - "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), - "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), - "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), - "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), - 
"=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), - "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), - "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), - "=r"(dst_ptr[63]) - : "r"(src_addr)); - } else if constexpr (N == 64) { - asm volatile( - "tcgen05.ld.sync.aligned.16x128b.pack::16b.x64.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " - "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " - "%28, " - "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " - "%42, " - "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " - "%56, " - "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " - "%70, " - "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " - "%84, " - "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " - "%98, " - "%99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, " - "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " - "%121, %122, %123, %124, %125, %126, %127}," - "[%128];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), - "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), - "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), - "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), - "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), - "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), 
"=r"(dst_ptr[47]), - "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), - "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), - "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), - "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), - "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), - "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), - "=r"(dst_ptr[66]), "=r"(dst_ptr[67]), "=r"(dst_ptr[68]), - "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), - "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), - "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), - "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), - "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), - "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), - "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), - "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), - "=r"(dst_ptr[93]), "=r"(dst_ptr[94]), "=r"(dst_ptr[95]), - "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), - "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), - "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), - "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), - "=r"(dst_ptr[108]), "=r"(dst_ptr[109]), "=r"(dst_ptr[110]), - "=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), - "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), - "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), - "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), - "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), "=r"(dst_ptr[125]), - "=r"(dst_ptr[126]), "=r"(dst_ptr[127]) - : "r"(src_addr)); - } else { - asm volatile("trap"); - } - } -}; - -// 16 data path lanes, 256-bit pattern, repeated N times -template class tmem_ld_16dp256bNx; -template <> class tmem_ld_16dp256bNx { -public: - template - static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 32, - "N must be a power of 
2 and lies between 1 ~ 32"); - - if constexpr (N == 1) { - asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32" - "{%0, %1, %2, %3}," - "[%4];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]) - : "r"(src_addr)); - } else if constexpr (N == 2) { - asm volatile("tcgen05.ld.sync.aligned.16x256b.x2.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7}," - "[%8];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) - : "r"(src_addr)); - } else if constexpr (N == 4) { - asm volatile( - "tcgen05.ld.sync.aligned.16x256b.x4.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " - "%14, %15}," - "[%16];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]) - : "r"(src_addr)); - } else if constexpr (N == 8) { - asm volatile( - "tcgen05.ld.sync.aligned.16x256b.x8.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " - "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " - "%26, %27, %28, %29, %30, %31}," - "[%32];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) - : "r"(src_addr)); - } else if 
constexpr (N == 16) { - asm volatile( - "tcgen05.ld.sync.aligned.16x256b.x16.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " - "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " - "%28, " - "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " - "%42, " - "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " - "%56, " - "%57, %58, %59, %60, %61, %62, %63}," - "[%64];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), - "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), - "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), - "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), - "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), - "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), - "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), - "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), - "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), - "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), - "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), - "=r"(dst_ptr[63]) - : "r"(src_addr)); - } else if constexpr (N == 32) { + } else if constexpr (N == 32) { asm volatile( "tcgen05.ld.sync.aligned.16x256b.x32.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " @@ -1187,193 +678,35 @@ public: } } }; -template <> class tmem_ld_16dp256bNx { -public: - 
template - static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 32, - "N must be a power of 2 and lies between 1 ~ 32"); - - if constexpr (N == 1) { - asm volatile("tcgen05.ld.sync.aligned.16x256b.pack::16b.x1.b32" - "{%0, %1, %2, %3}," - "[%4];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]) - : "r"(src_addr)); - } else if constexpr (N == 2) { - asm volatile("tcgen05.ld.sync.aligned.16x256b.pack::16b.x2.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7}," - "[%8];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) - : "r"(src_addr)); - } else if constexpr (N == 4) { - asm volatile( - "tcgen05.ld.sync.aligned.16x256b.pack::16b.x4.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " - "%14, %15}," - "[%16];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]) - : "r"(src_addr)); - } else if constexpr (N == 8) { - asm volatile( - "tcgen05.ld.sync.aligned.16x256b.pack::16b.x8.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " - "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " - "%26, %27, %28, %29, %30, %31}," - "[%32];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), 
"=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) - : "r"(src_addr)); - } else if constexpr (N == 16) { - asm volatile( - "tcgen05.ld.sync.aligned.16x256b.pack::16b.x16.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " - "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " - "%28, " - "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " - "%42, " - "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " - "%56, " - "%57, %58, %59, %60, %61, %62, %63}," - "[%64];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), - "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), - "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), - "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), - "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), - "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), - "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), - "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), - "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), - "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), - "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), - "=r"(dst_ptr[63]) - : "r"(src_addr)); - } else if constexpr (N == 32) { - asm volatile( - 
"tcgen05.ld.sync.aligned.16x256b.pack::16b.x32.b32" - "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " - "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " - "%28, " - "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " - "%42, " - "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " - "%56, " - "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " - "%70, " - "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " - "%84, " - "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " - "%98, " - "%99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, " - "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " - "%121, %122, %123, %124, %125, %126, %127}," - "[%128];\n" - : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), - "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), - "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), - "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), - "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), - "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), - "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), - "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), - "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), - "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), - "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), - "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), - "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), - "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), - "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), - "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), - "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), - "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), - "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), - "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), - "=r"(dst_ptr[60]), 
"=r"(dst_ptr[61]), "=r"(dst_ptr[62]), - "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), - "=r"(dst_ptr[66]), "=r"(dst_ptr[67]), "=r"(dst_ptr[68]), - "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), - "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), - "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), - "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), - "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), - "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), - "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), - "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), - "=r"(dst_ptr[93]), "=r"(dst_ptr[94]), "=r"(dst_ptr[95]), - "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), - "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), - "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), - "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), - "=r"(dst_ptr[108]), "=r"(dst_ptr[109]), "=r"(dst_ptr[110]), - "=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), - "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), - "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), - "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), - "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), "=r"(dst_ptr[125]), - "=r"(dst_ptr[126]), "=r"(dst_ptr[127]) - : "r"(src_addr)); - } else { - asm volatile("trap"); - } - } -}; // 32 data path lanes, 64-bit pattern, repeated N times // (conducted with 2x16dp64bNx) -template class tmem_ld_32dp64bNx { +class tmem_ld_32dp64bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - tmem_ld_16dp64bNx::copy(src_addr, dst_ptr); - tmem_ld_16dp64bNx::copy(src_addr + (16 << 16), dst_ptr + N); + tmem_ld_16dp64bNx::copy(src_addr, dst_ptr); + tmem_ld_16dp64bNx::copy(src_addr + (16 << 16), dst_ptr + N); } }; // 32 data path lanes, 128-bit pattern, repeated N times -template class tmem_ld_32dp128bNx { +class tmem_ld_32dp128bNx { 
public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - tmem_ld_16dp128bNx::copy(src_addr, dst_ptr); - tmem_ld_16dp128bNx::copy(src_addr + (16 << 16), dst_ptr + N * 2); + tmem_ld_16dp128bNx::copy(src_addr, dst_ptr); + tmem_ld_16dp128bNx::copy(src_addr + (16 << 16), dst_ptr + N * 2); } }; // 32 data path lanes, 256-bit pattern, repeated N times -template class tmem_ld_32dp256bNx { +class tmem_ld_32dp256bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - tmem_ld_16dp256bNx::copy(src_addr, dst_ptr); - tmem_ld_16dp256bNx::copy(src_addr + (16 << 16), dst_ptr + N * 4); + tmem_ld_16dp256bNx::copy(src_addr, dst_ptr); + tmem_ld_16dp256bNx::copy(src_addr + (16 << 16), dst_ptr + N * 4); } }; diff --git a/tilelang/intrinsics/mma_macro_generator.py b/tilelang/intrinsics/mma_macro_generator.py index bbfeb157..8c546c63 100644 --- a/tilelang/intrinsics/mma_macro_generator.py +++ b/tilelang/intrinsics/mma_macro_generator.py @@ -45,10 +45,7 @@ class TensorCoreIntrinEmitter: "int8": "int8", "int32": "int32", "float8_e4m3": "e4m3", - "float8_e4m3fn": "e4m3", - "float8_e4m3fnuz": "e4m3", "float8_e5m2": "e5m2", - "float8_e5m2fnuz": "e5m2", } # Represent the thread binding in the form of (tx, warp_n, warp_m) diff --git a/tilelang/intrinsics/tcgen05_macro_generator.py b/tilelang/intrinsics/tcgen05_macro_generator.py index 966f4dc4..e53ff7cb 100644 --- a/tilelang/intrinsics/tcgen05_macro_generator.py +++ b/tilelang/intrinsics/tcgen05_macro_generator.py @@ -169,11 +169,12 @@ class TensorCoreIntrinEmitter(MMAIntrinEmitter): accum_dtype_in_bits = DataType(accum_dtype).bits meta = self.get_tcgen5_mma_meta(m_dim, n_dim, k_dim) - if len(meta) != 5: + if len(meta) != 3: raise ValueError( f"Unsupported TCGEN5MMA configuration for desc generation: M={m_dim}, N={n_dim}, " f"K={k_dim}, A dtype={self.a_dtype}, accum dtype={self.accum_dtype}") - atom_m, atom_n, atom_k, enable_ws, enable_2cta = (int(x) for x in meta) + 
atom_m, atom_n, atom_k = (int(x) for x in meta) + enable_ws = atom_m != 128 # by default, we utilize non-swizzle layout offset a_leading_byte_offset = (8 * 8 * elems_in_bytes) if a_is_k_major else (8 * m_dim * @@ -381,10 +382,10 @@ class TensorCoreIntrinEmitter(MMAIntrinEmitter): k = int(self.chunk) meta = self.get_tcgen5_mma_meta(m, n, k) - if len(meta) != 5: + if len(meta) != 3: raise ValueError(f"Unsupported TCGEN5MMA configuration: M={m}, N={n}, K={k}, " f"A dtype={self.a_dtype}, accum dtype={self.accum_dtype}") - atom_m, atom_n, _, _, _ = (int(x) for x in meta) + atom_m, atom_n, _ = (int(x) for x in meta) if m % atom_m != 0 or n % atom_n != 0: raise ValueError( diff --git a/tilelang/jit/adapter/wrapper.py b/tilelang/jit/adapter/wrapper.py index 75607976..48b8e908 100644 --- a/tilelang/jit/adapter/wrapper.py +++ b/tilelang/jit/adapter/wrapper.py @@ -144,7 +144,6 @@ class TLCUDASourceWrapper: "float16": "half_t", "bfloat16": "bfloat16_t", "float8_e4m3": "fp8_e4_t", - "float8_e4m3fn": "fp8_e4_t", "float8_e5m2": "fp8_e5_t", "float64": "double", "int64": "int64_t", diff --git a/tilelang/tileop/gemm/gemm_tcgen05.py b/tilelang/tileop/gemm/gemm_tcgen05.py index 1de9fe87..52c192e5 100644 --- a/tilelang/tileop/gemm/gemm_tcgen05.py +++ b/tilelang/tileop/gemm/gemm_tcgen05.py @@ -85,9 +85,6 @@ class GemmTCGEN5(GemmBase): raise ValueError(f"TCGEN5MMA currently only supports gemm_ss, got " f"A scope {self.A.scope()}, B scope {self.B.scope()}") - atom_m, atom_n, atom_k, enable_ws, enable_2cta = mma_emitter.get_tcgen5_mma_meta( - self.M, self.N, self.K) - if self.A.scope() not in {"shared", "shared.dyn", "shared.tmem"}: raise ValueError(f"Unsupported A scope for TCGEN5MMA: {self.A.scope()}") if self.B.scope() not in {"shared", "shared.dyn"}: @@ -106,7 +103,7 @@ class GemmTCGEN5(GemmBase): raise ValueError("TCGEN5MMA expects 2D coordinates for C buffer access") accum_dtype = str(self.C.dtype) - if accum_dtype not in ["float32", 'float16']: + if accum_dtype != "float32": raise 
ValueError(f"Unsupported accumulator dtype for TCGEN5MMA: {accum_dtype}") A_shared = self.ARegion -- GitLab From fddcbbd665d2fc8eed0f629fbcb2521798068d66 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 16:48:45 +0800 Subject: [PATCH 036/139] [CI]: Bump actions/checkout from 5 to 6 (#1319) Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci.yml | 4 ++-- .github/workflows/dist.yml | 4 ++-- .github/workflows/pr-perfbench-bot.yml | 2 +- .github/workflows/publish-docs.yml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f9fe3286..c33a25b6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,7 @@ jobs: timeout-minutes: 30 steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: recursive @@ -104,7 +104,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index 0ba3fbc3..ed63914c 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -52,7 +52,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 1 submodules: recursive @@ -122,7 +122,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 1 submodules: recursive diff --git a/.github/workflows/pr-perfbench-bot.yml b/.github/workflows/pr-perfbench-bot.yml index 37da4e3c..e6954bcc 100644 --- a/.github/workflows/pr-perfbench-bot.yml +++ b/.github/workflows/pr-perfbench-bot.yml @@ -33,7 +33,7 @@ jobs: runs-on: [self-hosted, nvidia] steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: 
actions/checkout@v6 with: ref: refs/pull/${{ github.event.issue.number }}/merge fetch-depth: 0 diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index 95330310..2197015b 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -25,7 +25,7 @@ jobs: runs-on: [self-hosted, nvidia] steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: recursive -- GitLab From 2a70fd3f9e93dee4e776a9891377340d8170cc5e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 16:49:18 +0800 Subject: [PATCH 037/139] [CI]: Bump pypa/cibuildwheel from 3.2 to 3.3 (#1318) Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dist.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index ed63914c..ff230af4 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -160,7 +160,7 @@ jobs: fi - name: Build wheels - uses: pypa/cibuildwheel@v3.2 + uses: pypa/cibuildwheel@v3.3 with: package-dir: . 
output-dir: wheelhouse -- GitLab From 01d207fa1494a5c46b2cc44d0682ce0544271418 Mon Sep 17 00:00:00 2001 From: Chaofan Lin Date: Mon, 24 Nov 2025 18:32:00 +0800 Subject: [PATCH 038/139] [Installation] Fix building using customized TVM path (#1326) --- cmake/load_tvm.cmake | 5 ++++- docs/get_started/Installation.md | 9 +++++---- tilelang/env.py | 6 +++--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cmake/load_tvm.cmake b/cmake/load_tvm.cmake index f013c3ba..cb21be95 100644 --- a/cmake/load_tvm.cmake +++ b/cmake/load_tvm.cmake @@ -3,12 +3,15 @@ set(TVM_BUILD_FROM_SOURCE TRUE) set(TVM_SOURCE ${CMAKE_SOURCE_DIR}/3rdparty/tvm) -if(DEFINED $ENV{TVM_ROOT}) +if(DEFINED ENV{TVM_ROOT}) if(EXISTS $ENV{TVM_ROOT}/cmake/config.cmake) set(TVM_SOURCE $ENV{TVM_ROOT}) + message(STATUS "Using TVM_ROOT from environment variable: ${TVM_SOURCE}") endif() endif() +message(STATUS "Using TVM source: ${TVM_SOURCE}") + set(TVM_INCLUDES ${TVM_SOURCE}/include ${TVM_SOURCE}/src diff --git a/docs/get_started/Installation.md b/docs/get_started/Installation.md index be0d794e..585a0029 100644 --- a/docs/get_started/Installation.md +++ b/docs/get_started/Installation.md @@ -93,14 +93,16 @@ Some useful CMake options you can toggle while configuring: (using-existing-tvm)= -### Building with Existing TVM Installation +### Building with Customized TVM Path -If you already have a compatible TVM installation, use the `TVM_ROOT` environment variable to specify the location of your existing TVM repository when building tilelang: +If you already have a TVM codebase, use the `TVM_ROOT` environment variable to specify the location of your existing TVM repository when building tilelang: ```bash TVM_ROOT= pip install . -v ``` +> **Note**: This will still rebuild the TVM-related libraries (stored in `TL_LIBS`). And this method often leads to some path issues. Check `env.py` to see some environment variables which are not set properly. 
+ (install-using-docker)= ## Install Using Docker @@ -197,8 +199,7 @@ Set `NO_TOOLCHAIN_VERSION=ON` to disable this. ### Run-time environment variables - -TODO +Please refer to the `env.py` file for a full list of supported run-time environment variables. ## Other Tips diff --git a/tilelang/env.py b/tilelang/env.py index b98bbf98..39d9e722 100644 --- a/tilelang/env.py +++ b/tilelang/env.py @@ -314,9 +314,9 @@ else: if tvm_path not in sys.path: prepend_pythonpath(tvm_path) env.TVM_IMPORT_PYTHON_PATH = tvm_path - - if os.environ.get("TVM_LIBRARY_PATH") is None: - os.environ['TVM_LIBRARY_PATH'] = env.TVM_LIBRARY_PATH = os.pathsep.join(TL_LIBS) +# By default, the built TVM-related libraries are stored in TL_LIBS. +if os.environ.get("TVM_LIBRARY_PATH") is None: + os.environ['TVM_LIBRARY_PATH'] = env.TVM_LIBRARY_PATH = os.pathsep.join(TL_LIBS) # Initialize CUTLASS paths if os.environ.get("TL_CUTLASS_PATH", None) is None: -- GitLab From 6c2162a9fdcd1e754faea9944da033c3199b08c1 Mon Sep 17 00:00:00 2001 From: Yichen Yan Date: Mon, 24 Nov 2025 19:07:51 +0800 Subject: [PATCH 039/139] [Release] Allow developer with write permission to trigger wheel release (#1322) --- .github/workflows/dist.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index ff230af4..73c08936 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -1,5 +1,6 @@ name: Dist on: + workflow_dispatch: schedule: # gemini said this is 6:00 china time - cron: "0 22 * * *" -- GitLab From caa6dd3f02885960a75f299f73a94f67e0817477 Mon Sep 17 00:00:00 2001 From: Tong WU <109033598+Rachmanino@users.noreply.github.com> Date: Mon, 24 Nov 2025 19:38:14 +0800 Subject: [PATCH 040/139] [Feat] Support warp reduce (#1316) * [Feat] Support warp reduce * lint * add test * lint --- src/op/builtin.cc | 25 ++++++ src/op/builtin.h | 25 ++++++ src/target/codegen_cuda.cc | 10 +++ src/tl_templates/cuda/reduce.h | 31 +++++++ 
.../test_tilelang_language_warp_reduce.py | 83 +++++++++++++++++++ tilelang/language/__init__.py | 5 ++ tilelang/language/reduce.py | 80 ++++++++++++++++++ 7 files changed, 259 insertions(+) create mode 100644 testing/python/language/test_tilelang_language_warp_reduce.py diff --git a/src/op/builtin.cc b/src/op/builtin.cc index e7e86f2f..ced86cfa 100644 --- a/src/op/builtin.cc +++ b/src/op/builtin.cc @@ -341,5 +341,30 @@ TIR_DEFINE_TL_BUILTIN(tcgen05_mma_arrive) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TIR_DEFINE_TL_BUILTIN(warp_reduce_sum) + .set_num_inputs(1) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_TL_BUILTIN(warp_reduce_max) + .set_num_inputs(1) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_TL_BUILTIN(warp_reduce_min) + .set_num_inputs(1) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_TL_BUILTIN(warp_reduce_bitand) + .set_num_inputs(1) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_TL_BUILTIN(warp_reduce_bitor) + .set_num_inputs(1) + .set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + } // namespace tl } // namespace tvm diff --git a/src/op/builtin.h b/src/op/builtin.h index f5c7d9ed..7ae638f1 100644 --- a/src/op/builtin.h +++ b/src/op/builtin.h @@ -571,6 +571,31 @@ TVM_DLL const Op &device_assert(); */ TVM_DLL const Op &device_assert_with_msg(); +/*! + * \brief tilelang intrinsic for warp reduction sum. + */ +TVM_DLL const Op &warp_reduce_sum(); + +/*! + * \brief tilelang intrinsic for warp reduction max. + */ +TVM_DLL const Op &warp_reduce_max(); + +/*! + * \brief tilelang intrinsic for warp reduction min. + */ +TVM_DLL const Op &warp_reduce_min(); + +/*! + * \brief tilelang intrinsic for warp reduction bitand. + */ +TVM_DLL const Op &warp_reduce_bitand(); + +/*! + * \brief tilelang intrinsic for warp reduction bitor. 
+ */ +TVM_DLL const Op &warp_reduce_bitor(); + } // namespace tl } // namespace tvm diff --git a/src/target/codegen_cuda.cc b/src/target/codegen_cuda.cc index dda96925..99512b8b 100644 --- a/src/target/codegen_cuda.cc +++ b/src/target/codegen_cuda.cc @@ -2609,6 +2609,16 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) { std::string func_name = math_func(op->dtype, "fdiv", rounding_mode); os << func_name << "(" << PrintExpr(op->args[0]) << ", " << PrintExpr(op->args[1]) << ")"; + } else if (op->op.same_as(tl::warp_reduce_sum())) { + os << "tl::warp_reduce_sum(" << PrintExpr(op->args[0]) << ")"; + } else if (op->op.same_as(tl::warp_reduce_max())) { + os << "tl::warp_reduce_max(" << PrintExpr(op->args[0]) << ")"; + } else if (op->op.same_as(tl::warp_reduce_min())) { + os << "tl::warp_reduce_min(" << PrintExpr(op->args[0]) << ")"; + } else if (op->op.same_as(tl::warp_reduce_bitand())) { + os << "tl::warp_reduce_bitand(" << PrintExpr(op->args[0]) << ")"; + } else if (op->op.same_as(tl::warp_reduce_bitor())) { + os << "tl::warp_reduce_bitor(" << PrintExpr(op->args[0]) << ")"; } else { CodeGenC::VisitExpr_(op, os); } diff --git a/src/tl_templates/cuda/reduce.h b/src/tl_templates/cuda/reduce.h index a083c711..45824264 100644 --- a/src/tl_templates/cuda/reduce.h +++ b/src/tl_templates/cuda/reduce.h @@ -250,4 +250,35 @@ template struct CumSum2D { } }; +template +TL_DEVICE T warp_reduce(T value, ReduceOp op) { + constexpr uint32_t mask = 0xffffffff; + value = op(value, __shfl_xor_sync(mask, value, 16)); + value = op(value, __shfl_xor_sync(mask, value, 8)); + value = op(value, __shfl_xor_sync(mask, value, 4)); + value = op(value, __shfl_xor_sync(mask, value, 2)); + value = op(value, __shfl_xor_sync(mask, value, 1)); + return value; +} + +template TL_DEVICE T warp_reduce_sum(T value) { + return warp_reduce(value, SumOp()); +} + +template TL_DEVICE T warp_reduce_max(T value) { + return warp_reduce(value, MaxOp()); +} + +template TL_DEVICE T 
warp_reduce_min(T value) { + return warp_reduce(value, MinOp()); +} + +template TL_DEVICE T warp_reduce_bitand(T value) { + return warp_reduce(value, BitAndOp()); +} + +template TL_DEVICE T warp_reduce_bitor(T value) { + return warp_reduce(value, BitOrOp()); +} + } // namespace tl diff --git a/testing/python/language/test_tilelang_language_warp_reduce.py b/testing/python/language/test_tilelang_language_warp_reduce.py new file mode 100644 index 00000000..681b2347 --- /dev/null +++ b/testing/python/language/test_tilelang_language_warp_reduce.py @@ -0,0 +1,83 @@ +import torch + +import tilelang +import tilelang.testing +import tilelang.language as T + + +@tilelang.jit +def get_kernel(reduce_op: str, dtype: str): + + assert reduce_op in ["sum", "max", "min", "bitand", "bitor"] + + @T.prim_func + def main(x: T.Tensor((32), dtype)): + with T.Kernel(1, threads=32): + tx = T.get_thread_binding(0) + local_val = T.alloc_local([1], dtype) + local_val[0] = x[tx] + reduced_val = T.alloc_local([1], dtype) + if reduce_op == "sum": + reduced_val[0] = T.warp_reduce_sum(local_val[0]) + elif reduce_op == "max": + reduced_val[0] = T.warp_reduce_max(local_val[0]) + elif reduce_op == "min": + reduced_val[0] = T.warp_reduce_min(local_val[0]) + elif reduce_op == "bitand": + reduced_val[0] = T.warp_reduce_bitand(local_val[0]) + elif reduce_op == "bitor": + reduced_val[0] = T.warp_reduce_bitor(local_val[0]) + x[tx] = reduced_val[0] + + return main + + +def test_warp_reduce_sum(): + a = torch.randn((32,), dtype=torch.float32, device='cuda') + kernel = get_kernel('sum', 'float32') + ref = torch.full_like(a, a.sum()) + kernel(a) + torch.testing.assert_close(a, ref) + + +def test_warp_reduce_max(): + a = torch.randn((32,), dtype=torch.float32, device='cuda') + kernel = get_kernel("max", 'float32') + print(kernel.get_kernel_source()) + ref = torch.full_like(a, a.max()) + kernel(a) + torch.testing.assert_close(a, ref) + + +def test_warp_reduce_min(): + a = torch.randn((32,), dtype=torch.float32, 
device='cuda') + kernel = get_kernel("min", 'float32') + ref = torch.full_like(a, a.min()) + kernel(a) + torch.testing.assert_close(a, ref) + + +def test_warp_reduce_bitand(): + a = torch.randint(0, 100, size=(32,), dtype=torch.int32, device='cuda') + kernel = get_kernel("bitand", 'int32') + ref_val = a[0] + for i in range(1, a.shape[0]): + ref_val = ref_val & a[i] + ref = torch.full_like(a, ref_val) + kernel(a) + torch.testing.assert_close(a, ref) + + +def test_warp_reduce_bitor(): + a = torch.randint(0, 100, size=(32,), dtype=torch.int32, device='cuda') + kernel = get_kernel("bitor", 'int32') + ref_val = a[0] + for i in range(1, a.shape[0]): + ref_val = ref_val | a[i] + ref = torch.full_like(a, ref_val) + kernel(a) + torch.testing.assert_close(a, ref) + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/tilelang/language/__init__.py b/tilelang/language/__init__.py index 95488bdf..75d8d0b4 100644 --- a/tilelang/language/__init__.py +++ b/tilelang/language/__init__.py @@ -65,6 +65,11 @@ from .reduce import ( reduce_bitxor, # noqa: F401 cumsum, # noqa: F401 finalize_reducer, # noqa: F401 + warp_reduce_sum, # noqa: F401 + warp_reduce_max, # noqa: F401 + warp_reduce_min, # noqa: F401 + warp_reduce_bitand, # noqa: F401 + warp_reduce_bitor, # noqa: F401 ) from .print import print, device_assert # noqa: F401 from .customize import ( diff --git a/tilelang/language/reduce.py b/tilelang/language/reduce.py index 09289559..23bb6d05 100644 --- a/tilelang/language/reduce.py +++ b/tilelang/language/reduce.py @@ -325,3 +325,83 @@ def finalize_reducer(reducer: tir.Buffer): tir.op.Op.get("tl.finalize_reducer"), reducer.access_ptr("w"), ) + + +def warp_reduce_sum(value: tir.PrimExpr): + """Perform warp reduction sum on a register value. + + This function reduces a value across all threads in a warp using shuffle operations. + Each thread provides a register `value`, and after the reduction, all threads + will have the sum of all values across the warp. 
+ + Args: + value (tir.PrimExpr): The input register value to reduce + + Returns: + tir.PrimExpr: The reduced sum value (same on all threads in the warp) + """ + return tir.call_intrin(value.dtype, tir.op.Op.get("tl.warp_reduce_sum"), value) + + +def warp_reduce_max(value: tir.PrimExpr): + """Perform warp reduction max on a register value. + + This function reduces a value across all threads in a warp using shuffle operations. + Each thread provides a register `value`, and after the reduction, all threads + will have the max of all values across the warp. + + Args: + value (tir.PrimExpr): The input register value to reduce + + Returns: + tir.PrimExpr: The reduced max value (same on all threads in the warp) + """ + return tir.call_intrin(value.dtype, tir.op.Op.get("tl.warp_reduce_max"), value) + + +def warp_reduce_min(value: tir.PrimExpr): + """Perform warp reduction min on a register value. + + This function reduces a value across all threads in a warp using shuffle operations. + Each thread provides a register `value`, and after the reduction, all threads + will have the min of all values across the warp. + + Args: + value (tir.PrimExpr): The input register value to reduce + + Returns: + tir.PrimExpr: The reduced min value (same on all threads in the warp) + """ + return tir.call_intrin(value.dtype, tir.op.Op.get("tl.warp_reduce_min"), value) + + +def warp_reduce_bitand(value: tir.PrimExpr): + """Perform warp reduction bitwise-and on a register value. + + This function reduces a value across all threads in a warp using shuffle operations. + Each thread provides a register `value`, and after the reduction, all threads + will have the bitwise-and of all values across the warp. 
+ + Args: + value (tir.PrimExpr): The input register value to reduce + + Returns: + tir.PrimExpr: The reduced bitwise-and value (same on all threads in the warp) + """ + return tir.call_intrin(value.dtype, tir.op.Op.get("tl.warp_reduce_bitand"), value) + + +def warp_reduce_bitor(value: tir.PrimExpr): + """Perform warp reduction bitwise-or on a register value. + + This function reduces a value across all threads in a warp using shuffle operations. + Each thread provides a register `value`, and after the reduction, all threads + will have the bitwise-or of all values across the warp. + + Args: + value (tir.PrimExpr): The input register value to reduce + + Returns: + tir.PrimExpr: The reduced bitwise-or value (same on all threads in the warp) + """ + return tir.call_intrin(value.dtype, tir.op.Op.get("tl.warp_reduce_bitor"), value) -- GitLab From c30df2a1c58bc6296e2a6027b4ebacf9f1b82202 Mon Sep 17 00:00:00 2001 From: Wenhao Xie Date: Tue, 25 Nov 2025 01:08:35 +0800 Subject: [PATCH 041/139] [Enhancement] Support more dtype in `T.print` (#1329) * [Enhancement] Support more dtype in `T.print` * upd * upd --- src/tl_templates/cuda/debug.h | 353 +++++------------- .../python/debug/test_tilelang_debug_print.py | 21 +- 2 files changed, 107 insertions(+), 267 deletions(-) diff --git a/src/tl_templates/cuda/debug.h b/src/tl_templates/cuda/debug.h index 2724a814..020cb1f1 100644 --- a/src/tl_templates/cuda/debug.h +++ b/src/tl_templates/cuda/debug.h @@ -5,282 +5,107 @@ #endif #include "common.h" - #ifndef __CUDACC_RTC__ +#include #include #endif -// Template declaration for device-side debug printing (variable only) -template __device__ void debug_print_var(const char *msg, T var); - -// Overload for pointer type (supports any cv-qualified T*) -template __device__ void debug_print_var(const char *msg, T *var) { - printf( - "msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=pointer " - "value=%p\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, 
- threadIdx.z, var); -} - -// Specialization for signed char type -template <> -__device__ void debug_print_var(const char *msg, signed char var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=signed " - "char " - "value=%d\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, var); -} - -// Specialization for plain char type -template <> __device__ void debug_print_var(const char *msg, char var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=char " - "value=%d\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, (int)var); -} - -// Specialization for unsigned char type -template <> -__device__ void debug_print_var(const char *msg, - unsigned char var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " - "dtype=unsigned char " - "value=%d\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, var); -} - -// Specialization for integer type -template <> __device__ void debug_print_var(const char *msg, int var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=int " - "value=%d\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, var); -} - -// Specialization for unsigned integer type -template <> -__device__ void debug_print_var(const char *msg, - unsigned int var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=int " - "value=%u\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, var); -} - -// Specialization for bool type -template <> __device__ void debug_print_var(const char *msg, bool var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=bool " - "value=%s\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, var ? 
"true" : "false"); -} - -// Specialization for float type -template <> __device__ void debug_print_var(const char *msg, float var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=float " - "value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, var); -} - -// Specialization for half type -template <> __device__ void debug_print_var(const char *msg, half var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=half " - "value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, (float)var); -} - -// Specialization for half_t type -template <> -__device__ void debug_print_var(const char *msg, half_t var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=half_t " - "value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, (float)var); -} +template struct PrintTraits { + static __device__ void print_var(const char *msg, T val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " + "dtype=unknown value=%p\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, (const void *)&val); + } -// Specialization for bfloat16_t type -template <> -__device__ void debug_print_var(const char *msg, bfloat16_t var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " - "dtype=bfloat16_t value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, (float)var); -} + static __device__ void print_buffer(const char *msg, const char *buf_name, + int index, T val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " + "index=%d, dtype=unknown value=%p\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, buf_name, index, (const void *)&val); + } +}; + +#define DEFINE_PRINT_TRAIT(TYPE, NAME, FORMAT, CAST_TYPE) \ + template <> struct 
PrintTraits { \ + static __device__ void print_var(const char *msg, TYPE val) { \ + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " \ + "dtype=" NAME " value=" FORMAT "\n", \ + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, \ + threadIdx.y, threadIdx.z, (CAST_TYPE)val); \ + } \ + static __device__ void print_buffer(const char *msg, const char *buf_name, \ + int index, TYPE val) { \ + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " \ + "buffer=%s, index=%d, dtype=" NAME " value=" FORMAT "\n", \ + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, \ + threadIdx.y, threadIdx.z, buf_name, index, (CAST_TYPE)val); \ + } \ + } -// Specialization for double type -template <> -__device__ void debug_print_var(const char *msg, double var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=double " - "value=%lf\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, var); -} +DEFINE_PRINT_TRAIT(char, "char", "%d", int); +DEFINE_PRINT_TRAIT(signed char, "signed char", "%d", int); +DEFINE_PRINT_TRAIT(unsigned char, "unsigned char", "%u", unsigned int); +DEFINE_PRINT_TRAIT(short, "short", "%d", int); +DEFINE_PRINT_TRAIT(unsigned short, "unsigned short", "%u", unsigned int); +DEFINE_PRINT_TRAIT(int, "int", "%d", int); +DEFINE_PRINT_TRAIT(unsigned int, "uint", "%u", unsigned int); +DEFINE_PRINT_TRAIT(long, "long", "%ld", long); +DEFINE_PRINT_TRAIT(unsigned long, "ulong", "%lu", unsigned long); +DEFINE_PRINT_TRAIT(long long, "long long", "%lld", long long); + +DEFINE_PRINT_TRAIT(float, "float", "%f", float); +DEFINE_PRINT_TRAIT(double, "double", "%lf", double); +DEFINE_PRINT_TRAIT(half, "half", "%f", float); +DEFINE_PRINT_TRAIT(half_t, "half_t", "%f", float); +DEFINE_PRINT_TRAIT(bfloat16_t, "bfloat16_t", "%f", float); #if __CUDA_ARCH_LIST__ >= 890 -// Specialization for fp8_e4_t type -template <> -__device__ void debug_print_var(const char *msg, fp8_e4_t var) { - printf( - "msg='%s' 
BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=fp8_e4_t " - "value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, (float)var); -} - -// Specialization for fp8_e5_t type -template <> -__device__ void debug_print_var(const char *msg, fp8_e5_t var) { - printf( - "msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=fp8_e5_t " - "value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, (float)var); -} - +DEFINE_PRINT_TRAIT(fp8_e4_t, "fp8_e4_t", "%f", float); +DEFINE_PRINT_TRAIT(fp8_e5_t, "fp8_e5_t", "%f", float); #endif -// Template declaration for device-side debug printing (buffer only) -template -__device__ void debug_print_buffer_value(const char *msg, const char *buf_name, - int index, T var); - -// Specialization for signed char type -template <> -__device__ void -debug_print_buffer_value(const char *msg, const char *buf_name, - int index, signed char var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=signed char value=%d\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, var); -} - -// Specialization for unsigned char type -template <> -__device__ void -debug_print_buffer_value(const char *msg, const char *buf_name, - int index, unsigned char var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=char value=%d\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, var); -} - -// Specialization for integer type -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, int index, - int var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=int value=%d\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, var); -} - -// 
Specialization for unsigned integer type -template <> -__device__ void -debug_print_buffer_value(const char *msg, const char *buf_name, - int index, unsigned int var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=int value=%u\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, var); -} - -// Specialization for float type -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, int index, - float var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=float value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, var); -} - -// Specialization for half type -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, int index, - half var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=half value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, (float)var); -} - -// Specialization for half_t type -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, - int index, half_t var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=half_t value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, (float)var); -} - -// Specialization for bfloat16_t type -template <> -__device__ void -debug_print_buffer_value(const char *msg, const char *buf_name, - int index, bfloat16_t var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=bfloat16_t value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, (float)var); -} - -// Specialization for 
double type -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, - int index, double var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=double value=%lf\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, var); -} - -// Specialization for fp8_e4_t type -#if __CUDA_ARCH_LIST__ >= 890 -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, - int index, fp8_e4_t var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=fp8_e4_t value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, (float)var); -} +template <> struct PrintTraits { + static __device__ void print_var(const char *msg, bool val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=bool " + "value=%s\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, val ? "true" : "false"); + } + static __device__ void print_buffer(const char *msg, const char *buf_name, + int index, bool val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " + "index=%d, dtype=bool value=%s\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, buf_name, index, val ? 
"true" : "false"); + } +}; + +template struct PrintTraits { + static __device__ void print_var(const char *msg, T *val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " + "dtype=pointer value=%p\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, (void *)val); + } + static __device__ void print_buffer(const char *msg, const char *buf_name, + int index, T *val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " + "index=%d, dtype=pointer value=%p\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, buf_name, index, (void *)val); + } +}; -// Specialization for fp8_e5_t type -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, - int index, fp8_e5_t var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=fp8_e5_t value=%f\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, (float)var); +template __device__ void debug_print_var(const char *msg, T var) { + PrintTraits::print_var(msg, var); } -#endif - -// Specialization for int16 type -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, - int index, int16_t var) { - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=int16_t value=%d\n", - msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, - threadIdx.z, buf_name, index, (int32_t)var); +template +__device__ void debug_print_buffer_value(const char *msg, const char *buf_name, + int index, T var) { + PrintTraits::print_buffer(msg, buf_name, index, var); } TL_DEVICE void device_assert(bool cond) { assert(cond); } @@ -290,4 +115,4 @@ TL_DEVICE void device_assert_with_msg(bool cond, const char *msg) { printf("Device assert failed: %s\n", msg); assert(0); } -} +} \ No newline at end of file diff --git 
a/testing/python/debug/test_tilelang_debug_print.py b/testing/python/debug/test_tilelang_debug_print.py index fcfae4ed..a1aa42ed 100644 --- a/testing/python/debug/test_tilelang_debug_print.py +++ b/testing/python/debug/test_tilelang_debug_print.py @@ -19,9 +19,24 @@ def debug_print_buffer(M=16, N=16, dtype="float16"): def test_debug_print_buffer(): - debug_print_buffer(16, 16, dtype="float") - debug_print_buffer(16, 16, dtype="float16") - debug_print_buffer(16, 16, dtype="uint8") + debug_print_buffer(dtype='bool') + debug_print_buffer(dtype='int8') + debug_print_buffer(dtype='int16') + debug_print_buffer(dtype='int32') + debug_print_buffer(dtype='int64') + debug_print_buffer(dtype='uint8') + debug_print_buffer(dtype='uint16') + debug_print_buffer(dtype='uint32') + debug_print_buffer(dtype='uint64') + debug_print_buffer(dtype='float16') + debug_print_buffer(dtype='float32') + debug_print_buffer(dtype='float64') + debug_print_buffer(dtype='bfloat16') + debug_print_buffer(dtype='float8_e4m3') + debug_print_buffer(dtype='float8_e4m3fn') + debug_print_buffer(dtype='float8_e4m3fnuz') + debug_print_buffer(dtype='float8_e5m2') + debug_print_buffer(dtype='float8_e5m2fnuz') def debug_print_buffer_conditional(M=16, N=16): -- GitLab From 9dda774affbc13bbb142d5f59c91a6cb8aa88d39 Mon Sep 17 00:00:00 2001 From: Chaofan Lin Date: Tue, 25 Nov 2025 01:36:17 +0800 Subject: [PATCH 042/139] [BugFix] Use BufferRegion in tl.cumsum to infer buffer shape (#1321) * [BugFix] Use BufferRegion in tl.cumsum to infer buffer shape * remove debug lines * remove rubbish * Fix decorator syntax for atomic_different_memory_orders_program --------- Co-authored-by: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> --- src/op/reduce.cc | 91 +++++++++++++++++-- src/op/reduce.h | 8 +- .../python/issue/test_tilelang_issue_1001.py | 33 +++++++ .../test_tilelang_language_atomic_add.py | 2 +- tilelang/analysis/__init__.py | 1 + tilelang/analysis/ast_printer.py | 23 +++++ tilelang/engine/phase.py | 3 + 
tilelang/language/reduce.py | 8 +- 8 files changed, 155 insertions(+), 14 deletions(-) create mode 100644 testing/python/issue/test_tilelang_issue_1001.py create mode 100644 tilelang/analysis/ast_printer.py diff --git a/src/op/reduce.cc b/src/op/reduce.cc index 05dad48f..b6dbe865 100644 --- a/src/op/reduce.cc +++ b/src/op/reduce.cc @@ -16,6 +16,7 @@ #include "../transform/loop_partition.h" #include "region.h" #include "tir/transforms/ir_utils.h" +#include "tvm/tir/stmt.h" namespace tvm { namespace tl { @@ -57,12 +58,65 @@ static BufferRegion NormalizeToBufferRegion(const PrimExpr &arg, RegionOp region(call->args, vmap); return BufferRegion(region->GetBuffer(), region->GetRanges()); } + // builtin.tvm_access_ptr(...) — map var to Buffer and take full region + if (call->op.same_as(builtin::tvm_access_ptr())) { + Var var = Downcast(call->args[1]); + Buffer buf = vmap[var]; + Array ranges; + for (PrimExpr extent : buf->shape) { + ranges.push_back(Range(IntImm(extent->dtype, 0), extent)); + } + return BufferRegion(buf, ranges); + } } LOG(FATAL) << "Unsupported argument for BufferRegion in reduce: " << arg; throw; // Unreachable } +// Build a tvm_access_ptr(handle) to the start of the 2D tile within a +// BufferRegion. Offset is computed from all but the last two dimensions; extent +// is the product of the last two extents. rw_mask: 1=read, 2=write, +// 3=readwrite. +static PrimExpr MakeAccessPtrFromRegion(const BufferRegion ®ion, + int rw_mask) { + Buffer buf = region->buffer; + int ndim = static_cast(buf->shape.size()); + ICHECK(ndim == 1 || ndim == 2) << "Cumsum expects buffers with 1 or 2 dims"; + + PrimExpr offset, extent; + if (ndim == 1) { + // Simple 1D region: offset and extent come from the single axis. 
+ auto axis = region->region[0]; + offset = axis->min; + extent = axis->extent; + } else { + // Compute row-major strides for ndim >= 2 + std::vector strides(ndim); + PrimExpr one = make_const(buf->shape[0].dtype(), 1); + PrimExpr cur = one; + for (int i = ndim - 1; i >= 0; --i) { + strides[i] = cur; + cur = cur * buf->shape[i]; + } + // Offset: sum_{i in [0..ndim-3]} min_i * stride_i + offset = make_const(buf->shape[0].dtype(), 0); + for (int i = 0; i < ndim - 2; ++i) { + offset = offset + region->region[i]->min * strides[i]; + } + + // Extent: last two extents product (elements) + extent = + region->region[ndim - 2]->extent * region->region[ndim - 1]->extent; + } + + // ptype and return handle + PrimExpr ptype = tir::TypeAnnotation(buf->dtype); + Array acc_args{ptype, buf->data, offset, extent, + IntImm(DataType::Int(32), rw_mask)}; + return Call(DataType::Handle(), builtin::tvm_access_ptr(), acc_args); +} + ReduceOp::ReduceOp(Array args, BufferMap vmap) { ObjectPtr node = tvm::ffi::make_object(); // Accept BufferRegion/BufferLoad/tl.region for src/dst @@ -231,6 +285,7 @@ Stmt ReduceOpNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { auto dst_scope = this->dst.scope(); if (src_scope == "local.fragment" && dst_scope == "local.fragment") { + Buffer src_buffer = get_buffer(this->src); Buffer dst_buffer = get_buffer(this->dst); Fragment src_layout = T.layout_map[this->src].as().value(); @@ -518,6 +573,16 @@ TIR_REGISTER_TL_OP(ReduceOp, reduce) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +// Normalize "Buffer" to BufferRegion. Use the shape of the buffer as the +// ranges. 
+static BufferRegion ConvertBufferToBufferRegion(const Buffer &buf) { + Array ranges; + for (PrimExpr extent : buf->shape) { + ranges.push_back(Range(IntImm(extent->dtype, 0), extent)); + } + return BufferRegion(buf, ranges); +} + CumSumOp::CumSumOp(Array args, BufferMap vmap) { /// CumSum constructor arguments: /// - src: input buffer @@ -526,11 +591,19 @@ CumSumOp::CumSumOp(Array args, BufferMap vmap) { /// - reverse: whether to cumsum in reverse order CHECK_EQ(args.size(), 4); ObjectPtr node = tvm::ffi::make_object(); - node->src = vmap[GetVarFromAccessPtr(args[0])]; - node->dst = vmap[GetVarFromAccessPtr(args[1])]; + // node->src = vmap[GetVarFromAccessPtr(args[0])]; + // node->dst = vmap[GetVarFromAccessPtr(args[1])]; + node->srcRegion_ = NormalizeToBufferRegion(args[0], vmap); + node->dstRegion_ = NormalizeToBufferRegion(args[1], vmap); + node->src = node->srcRegion_->buffer; + node->dst = node->dstRegion_->buffer; node->dim = args[2].as().value()->value; node->reverse = args[3].as().value(); - CHECK_LT(node->dim, static_cast(node->src->shape.size())); + CHECK_LT(node->dim, static_cast(node->src->shape.size())) + << "The dim of cumsum should be less than the number of dimensions. Got " + "dim=" + << node->dim << ", but src has " << node->src->shape.size() << " dims."; + data_ = std::move(node); } @@ -546,18 +619,22 @@ Stmt CumSumOpNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { auto threads = T.thread_bounds->extent; Array args; int ndim = static_cast(src->shape.size()); + + // Build access pointers from regions locally + PrimExpr srcPtr = MakeAccessPtrFromRegion(srcRegion_, 1); + PrimExpr dstPtr = MakeAccessPtrFromRegion(dstRegion_, 2); + if (ndim == 1) { ICHECK_EQ(dim, 0) << "Cumulative sum over a 1D buffer only supports dim " "= 0."; ss << "tl::CumSum1D<" << threads << ", " << (reverse ? 
"true" : "false") << ">::run"; - args = {StringImm(ss.str()), src.access_ptr(1), dst.access_ptr(3), - src->shape[0]}; + args = {StringImm(ss.str()), srcPtr, dstPtr, src->shape[0]}; } else if (ndim == 2) { ss << "tl::CumSum2D<" << threads << ", " << dim << ", " << (reverse ? "true" : "false") << ">::run"; - args = {StringImm(ss.str()), src.access_ptr(1), dst.access_ptr(3), - src->shape[0], src->shape[1]}; + args = {StringImm(ss.str()), srcPtr, dstPtr, src->shape[0], + src->shape[1]}; } else { LOG(FATAL) << "CumSum currently supports only 1D or 2D buffers, got " << ndim << "D."; diff --git a/src/op/reduce.h b/src/op/reduce.h index 3b124a4d..eb0599eb 100644 --- a/src/op/reduce.h +++ b/src/op/reduce.h @@ -133,8 +133,10 @@ public: class CumSumOpNode : public TileOperatorNode { public: tir::Buffer src, dst; ///< Source and destination buffers - int dim; ///< Dimension along which to compute cumulative sum - bool reverse; ///< Whether to compute in reverse order + // Optional: keep the original regions used to construct this op + BufferRegion srcRegion_, dstRegion_; + int dim; ///< Dimension along which to compute cumulative sum + bool reverse; ///< Whether to compute in reverse order TVM_FFI_DECLARE_OBJECT_INFO_FINAL("tl.CumSumOp", CumSumOpNode, TileOperatorNode); @@ -143,6 +145,8 @@ public: refl::ObjectDef() .def_ro("src", &CumSumOpNode::src) .def_ro("dst", &CumSumOpNode::dst) + .def_ro("srcRegion", &CumSumOpNode::srcRegion_) + .def_ro("dstRegion", &CumSumOpNode::dstRegion_) .def_ro("dim", &CumSumOpNode::dim) .def_ro("reverse", &CumSumOpNode::reverse); } diff --git a/testing/python/issue/test_tilelang_issue_1001.py b/testing/python/issue/test_tilelang_issue_1001.py new file mode 100644 index 00000000..77d8cc1f --- /dev/null +++ b/testing/python/issue/test_tilelang_issue_1001.py @@ -0,0 +1,33 @@ +import torch +import tilelang +import tilelang.testing +from tilelang import language as T + + +@tilelang.jit( + pass_configs={ + 
tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + },) +def _cumsum_view_infer_layout(hidden): + num_tokens = T.dynamic('num_tokens') + + @T.prim_func + def buggy_kernel(x: T.Tensor[(num_tokens, hidden), 'float']): + with T.Kernel(num_tokens, threads=128) as pid: + smem = T.alloc_shared((hidden,), dtype='float') + T.copy(x[pid, :], smem) + T.cumsum(T.view(smem, (1, hidden)), dim=1) + + return buggy_kernel + + +def test_cumsum_view_infer_layout(): + hidden = 128 + x = torch.randn(1, hidden, device='cuda', dtype=torch.float) + kernel = _cumsum_view_infer_layout(hidden) + kernel(x) + + +if __name__ == '__main__': + tilelang.testing.main() diff --git a/testing/python/language/test_tilelang_language_atomic_add.py b/testing/python/language/test_tilelang_language_atomic_add.py index 2472c20f..b157966a 100644 --- a/testing/python/language/test_tilelang_language_atomic_add.py +++ b/testing/python/language/test_tilelang_language_atomic_add.py @@ -260,7 +260,7 @@ def test_atomic_addx2(): run_atomic_addx2(32, 64, 8, 16) -@tilelang.jit(debug_root_path="./testing/python/language") +@tilelang.jit def atomic_different_memory_orders_program(M, N, block_M, block_N, dtype="float"): @T.prim_func diff --git a/tilelang/analysis/__init__.py b/tilelang/analysis/__init__.py index b72fc2ba..6e5ee5d6 100644 --- a/tilelang/analysis/__init__.py +++ b/tilelang/analysis/__init__.py @@ -1,3 +1,4 @@ """Tilelang IR analysis & visitors.""" +from .ast_printer import ASTPrinter # noqa: F401 from .nested_loop_checker import NestedLoopChecker # noqa: F401 diff --git a/tilelang/analysis/ast_printer.py b/tilelang/analysis/ast_printer.py new file mode 100644 index 00000000..c54ec5cf --- /dev/null +++ b/tilelang/analysis/ast_printer.py @@ -0,0 +1,23 @@ +from tvm import tir +from tvm.tir import PrimFunc +from tvm.tir.transform import prim_func_pass +from tvm.tir.stmt_functor import ir_transform + + +def ASTPrinter(): + """ + Print the AST of a 
given tilelang module for debugging. + """ + + def pre_visit(statement: tir.Stmt) -> None: + """ + Pre-order visitor to print all visited statements. + """ + + print(f"Visiting statement: {type(statement)}") + + def pass_fn(func: PrimFunc, mod, ctx) -> PrimFunc: + new_body = ir_transform(func.body, pre_visit, None) + return func.with_body(new_body) + + return prim_func_pass(pass_fn, opt_level=0) diff --git a/tilelang/engine/phase.py b/tilelang/engine/phase.py index 35c16a43..f686ba1f 100644 --- a/tilelang/engine/phase.py +++ b/tilelang/engine/phase.py @@ -74,6 +74,9 @@ def PreLowerSemanticCheck(mod: IRModule) -> None: Note: This is a validation-only pipeline of passes and does not modify or return the module. """ + # Debug + # tilelang.analysis.ASTPrinter()(mod) + # Check if there are any invalid nested loops. tilelang.analysis.NestedLoopChecker()(mod) diff --git a/tilelang/language/reduce.py b/tilelang/language/reduce.py index 23bb6d05..9d84e0b2 100644 --- a/tilelang/language/reduce.py +++ b/tilelang/language/reduce.py @@ -246,8 +246,8 @@ def cumsum_fragment(src: tir.Buffer, dst: tir.Buffer, dim: int, reverse: bool) - tir.call_intrin( "handle", tir.op.Op.get("tl.cumsum"), - cumsum_smem.access_ptr("r"), - cumsum_smem.access_ptr("w"), + buffer_to_tile_region(cumsum_smem, "r"), + buffer_to_tile_region(cumsum_smem, "w"), dim, reverse, ) @@ -300,8 +300,8 @@ def cumsum(src: tir.Buffer, dst: tir.Buffer | None = None, dim: int = 0, reverse return tir.call_intrin( "handle", tir.op.Op.get("tl.cumsum"), - src.access_ptr("r"), - dst.access_ptr("w"), + buffer_to_tile_region(src, "r"), + buffer_to_tile_region(dst, "w"), dim, reverse, ) -- GitLab From b02068546bd4f83beb3adea8771e91caa5022b35 Mon Sep 17 00:00:00 2001 From: Kuris <227995639+kurisu6912@users.noreply.github.com> Date: Tue, 25 Nov 2025 11:25:04 +0800 Subject: [PATCH 043/139] [Fix] fix wrong uint narrowing bug in tvm in #1310 (#1320) --- 3rdparty/tvm | 2 +- tilelang/language/allocate.py | 3 ++- 2 files changed, 3 
insertions(+), 2 deletions(-) diff --git a/3rdparty/tvm b/3rdparty/tvm index cd2b2b60..3354ada7 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit cd2b2b6013d155b5822300b0a0740fa65320dd9e +Subproject commit 3354ada79dd428e383102020814fa9c37638e752 diff --git a/tilelang/language/allocate.py b/tilelang/language/allocate.py index f0784e86..da1ca837 100644 --- a/tilelang/language/allocate.py +++ b/tilelang/language/allocate.py @@ -22,6 +22,7 @@ from tvm.tir import PrimExpr from tvm.script.parser.tir import block_attr from tvm.tir.buffer import Buffer from tvm.tir.expr import FloatImm, IntImm +from .v2.dtypes import dtype as tl_dtype def alloc_shared(shape, dtype, scope="shared.dyn"): @@ -135,7 +136,7 @@ def alloc_var(dtype, *args, scope="local.var", init: PrimExpr | None = None): buffer = T.alloc_buffer([1], dtype, scope=parsed_scope) if parsed_init is not None: if isinstance(parsed_init, (int, float, IntImm, FloatImm)): - block_attr({"tl.local_var_init": {buffer.data: parsed_init}}) + block_attr({"tl.local_var_init": {buffer.data: tl_dtype(dtype)(parsed_init)}}) else: T.buffer_store(buffer, parsed_init, 0) return buffer -- GitLab From 71b73e185aa2b72f3fabdae7382f9b0451034389 Mon Sep 17 00:00:00 2001 From: Kuris <227995639+kurisu6912@users.noreply.github.com> Date: Tue, 25 Nov 2025 12:32:48 +0800 Subject: [PATCH 044/139] [Refactor] Disable strided buffer load inside tvm (#1301) (#1332) --- 3rdparty/tvm | 2 +- .../test_tilelang_language_frontend_v2.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/3rdparty/tvm b/3rdparty/tvm index 3354ada7..e3af4000 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 3354ada79dd428e383102020814fa9c37638e752 +Subproject commit e3af400013551755a8df668ba77b530735931ade diff --git a/testing/python/language/test_tilelang_language_frontend_v2.py b/testing/python/language/test_tilelang_language_frontend_v2.py index 349f3caf..299a4127 100644 --- 
a/testing/python/language/test_tilelang_language_frontend_v2.py +++ b/testing/python/language/test_tilelang_language_frontend_v2.py @@ -427,7 +427,7 @@ def test_var_macro(): pass -def frame_inside_macro(): +def test_frame_inside_macro(): @tilelang.jit def get_sample_kernel(): @@ -453,5 +453,18 @@ def frame_inside_macro(): kernel = get_sample_kernel() # noqa: F841 +def test_buffer_slice_step(): + try: + + @T.prim_func + def prim_buffer_slice_step(A: T.Buffer((10,), T.int32), B: T.Buffer((5,), T.int32)): + with T.Kernel(1): + B[0:5:2] = A[0:10:2] + + raise AssertionError("Expect to report an error, buffer slice with step is not supported") + except RuntimeError: + pass + + if __name__ == '__main__': tilelang.testing.main() -- GitLab From 2f34840fc40ee74c9ab8f3b019983398e5610315 Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Tue, 25 Nov 2025 12:35:08 +0800 Subject: [PATCH 045/139] [Refactor] Moving `NormalizeToBufferRegion` and `MakeAccessPtrFromRegion` to utils (#1333) * Refactor GEMM and Reduce operations by moving NormalizeToBufferRegion and MakeAccessPtrFromRegion to utils.{h,cc} for better code organization and reuse. * lint fix --- src/op/gemm.cc | 97 ++++-------------------------------------- src/op/gemm_py.cc | 88 ++------------------------------------ src/op/reduce.cc | 95 ++--------------------------------------- src/op/utils.cc | 105 ++++++++++++++++++++++++++++++++++++++++++++++ src/op/utils.h | 35 ++++++++++++++++ 5 files changed, 155 insertions(+), 265 deletions(-) create mode 100644 src/op/utils.cc create mode 100644 src/op/utils.h diff --git a/src/op/gemm.cc b/src/op/gemm.cc index 48e6cdf6..cece1e6f 100644 --- a/src/op/gemm.cc +++ b/src/op/gemm.cc @@ -14,6 +14,7 @@ #include "../target/utils.h" #include "region.h" #include "tcgen5_meta.h" +#include "utils.h" namespace tvm { namespace tl { @@ -48,92 +49,9 @@ using namespace tir; * fails with an ICHECK (runtime assertion). 
No other validation is * performed here. */ -// Normalize a GEMM argument (BufferRegion/BufferLoad/tvm_access_ptr/tl.region) -// to BufferRegion -static BufferRegion NormalizeToBufferRegion(const PrimExpr &arg, - const BufferMap &vmap) { - // Case 1: Already a BufferRegion - if (arg->IsInstance()) { - return Downcast(arg); - } - - // Case 2: BufferLoad — convert indices to ranges (Ramp -> lanes, else - // extent=1) - if (const auto *load = arg.as()) { - Array ranges; - for (const PrimExpr &index : load->indices) { - if (const auto *ramp = index.as()) { - ICHECK(ramp->stride.as()) << "Ramp stride must be IntImm"; - ICHECK_EQ(ramp->stride.as()->value, 1) - << "Only stride-1 Ramp is supported in GEMM region conversion"; - ICHECK(ramp->lanes.as()) - << "Scalable vector lanes not supported in GEMM region conversion"; - ranges.push_back(Range::FromMinExtent(ramp->base, ramp->lanes)); - } else { - ranges.push_back(Range::FromMinExtent(index, 1)); - } - } - return BufferRegion(load->buffer, ranges); - } +// NormalizeToBufferRegion moved to src/op/utils.{h,cc} - // Case 3: Call nodes - if (const auto *call = arg.as()) { - // tl.region(...) — reconstruct via RegionOp - if (call->op.same_as(RegionOp::Get())) { - RegionOp region(call->args, vmap); - return BufferRegion(region->GetBuffer(), region->GetRanges()); - } - // builtin.tvm_access_ptr(...) — map var to Buffer and take full region - if (call->op.same_as(builtin::tvm_access_ptr())) { - Var var = Downcast(call->args[1]); - Buffer buf = vmap[var]; - Array ranges; - for (PrimExpr extent : buf->shape) { - ranges.push_back(Range(IntImm(extent->dtype, 0), extent)); - } - return BufferRegion(buf, ranges); - } - } - - LOG(FATAL) << "Unsupported GEMM argument for BufferRegion: " << arg; - throw; // Unreachable, keeps compiler happy -} - -// Build a tvm_access_ptr(handle) to the start of the 2D tile within a -// BufferRegion. 
Offset is computed from all but the last two dimensions; extent -// is the product of the last two extents. rw_mask: 1=read, 2=write, -// 3=readwrite. -static PrimExpr MakeAccessPtrFromRegion(const BufferRegion ®ion, - int rw_mask) { - Buffer buf = region->buffer; - int ndim = static_cast(buf->shape.size()); - ICHECK(ndim >= 2) << "GEMM expects buffers with at least 2 dims"; - - // Compute row-major strides - std::vector strides(ndim); - PrimExpr one = make_const(buf->shape[0].dtype(), 1); - PrimExpr cur = one; - for (int i = ndim - 1; i >= 0; --i) { - strides[i] = cur; - cur = cur * buf->shape[i]; - } - - // Offset: sum_{i in [0..ndim-3]} min_i * stride_i - PrimExpr offset = make_const(buf->shape[0].dtype(), 0); - for (int i = 0; i < ndim - 2; ++i) { - offset = offset + region->region[i]->min * strides[i]; - } - - // Extent: last two extents product (elements) - PrimExpr extent = - region->region[ndim - 2]->extent * region->region[ndim - 1]->extent; - - // ptype and return handle - PrimExpr ptype = tir::TypeAnnotation(buf->dtype); - Array acc_args{ptype, buf->data, offset, extent, - IntImm(DataType::Int(32), rw_mask)}; - return Call(DataType::Handle(), builtin::tvm_access_ptr(), acc_args); -} +// MakeAccessPtrFromRegion moved to src/op/utils.{h,cc} Gemm::Gemm(Array args, BufferMap vmap) { ObjectPtr node = tvm::ffi::make_object(); @@ -535,9 +453,12 @@ Stmt GemmNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { policy_->computeWarpPartition(m_, n_, block_size, T.target, gemm_inst); // Build access pointers from regions locally - PrimExpr Aptr = MakeAccessPtrFromRegion(aRegion_, /*r*/ 1); - PrimExpr Bptr = MakeAccessPtrFromRegion(bRegion_, /*r*/ 1); - PrimExpr Cptr = MakeAccessPtrFromRegion(cRegion_, /*rw*/ 3); + PrimExpr Aptr = + MakeAccessPtrFromRegion(aRegion_, /*r*/ 1, /*require_2d*/ true); + PrimExpr Bptr = + MakeAccessPtrFromRegion(bRegion_, /*r*/ 1, /*require_2d*/ true); + PrimExpr Cptr = + MakeAccessPtrFromRegion(cRegion_, /*rw*/ 3, 
/*require_2d*/ true); std::stringstream ss; std::string op_name; diff --git a/src/op/gemm_py.cc b/src/op/gemm_py.cc index ac506ee0..a6ddef64 100644 --- a/src/op/gemm_py.cc +++ b/src/op/gemm_py.cc @@ -14,98 +14,16 @@ #include "../target/utils.h" #include "region.h" #include "tcgen5_meta.h" +#include "utils.h" namespace tvm { namespace tl { using namespace tir; -// Normalize a GEMM argument (BufferRegion/BufferLoad/tvm_access_ptr/tl.region) -// to BufferRegion -static BufferRegion NormalizeToBufferRegion(const PrimExpr &arg, - const BufferMap &vmap) { - // Case 1: Already a BufferRegion - if (arg->IsInstance()) { - return Downcast(arg); - } - - // Case 2: BufferLoad — convert indices to ranges (Ramp -> lanes, else - // extent=1) - if (const auto *load = arg.as()) { - Array ranges; - for (const PrimExpr &index : load->indices) { - if (const auto *ramp = index.as()) { - ICHECK(ramp->stride.as()) << "Ramp stride must be IntImm"; - ICHECK_EQ(ramp->stride.as()->value, 1) - << "Only stride-1 Ramp is supported in GEMM region conversion"; - ICHECK(ramp->lanes.as()) - << "Scalable vector lanes not supported in GEMM region conversion"; - ranges.push_back(Range::FromMinExtent(ramp->base, ramp->lanes)); - } else { - ranges.push_back(Range::FromMinExtent(index, 1)); - } - } - return BufferRegion(load->buffer, ranges); - } - - // Case 3: Call nodes - if (const auto *call = arg.as()) { - // tl.region(...) — reconstruct via RegionOp - if (call->op.same_as(RegionOp::Get())) { - RegionOp region(call->args, vmap); - return BufferRegion(region->GetBuffer(), region->GetRanges()); - } - // builtin.tvm_access_ptr(...) 
— map var to Buffer and take full region - if (call->op.same_as(builtin::tvm_access_ptr())) { - Var var = Downcast(call->args[1]); - Buffer buf = vmap.at(var); - Array ranges; - for (PrimExpr extent : buf->shape) { - ranges.push_back(Range(IntImm(extent->dtype, 0), extent)); - } - return BufferRegion(buf, ranges); - } - } +// NormalizeToBufferRegion moved to src/op/utils.{h,cc} - LOG(FATAL) << "Unsupported GEMM argument for BufferRegion: " << arg; - throw; // Unreachable, keeps compiler happy -} - -// Build a tvm_access_ptr(handle) to the start of the 2D tile within a -// BufferRegion. Offset is computed from all but the last two dimensions; extent -// is the product of the last two extents. rw_mask: 1=read, 2=write, -// 3=readwrite. -static PrimExpr MakeAccessPtrFromRegion(const BufferRegion ®ion, - int rw_mask) { - Buffer buf = region->buffer; - int ndim = static_cast(buf->shape.size()); - ICHECK(ndim >= 2) << "GEMM expects buffers with at least 2 dims"; - - // Compute row-major strides - std::vector strides(ndim); - PrimExpr one = make_const(buf->shape[0].dtype(), 1); - PrimExpr cur = one; - for (int i = ndim - 1; i >= 0; --i) { - strides[i] = cur; - cur = cur * buf->shape[i]; - } - - // Offset: sum_{i in [0..ndim-3]} min_i * stride_i - PrimExpr offset = make_const(buf->shape[0].dtype(), 0); - for (int i = 0; i < ndim - 2; ++i) { - offset = offset + region->region[i]->min * strides[i]; - } - - // Extent: last two extents product (elements) - PrimExpr extent = - region->region[ndim - 2]->extent * region->region[ndim - 1]->extent; - - // ptype and return handle - PrimExpr ptype = tir::TypeAnnotation(buf->dtype); - Array acc_args{ptype, buf->data, offset, extent, - IntImm(DataType::Int(32), rw_mask)}; - return Call(DataType::Handle(), builtin::tvm_access_ptr(), acc_args); -} +// MakeAccessPtrFromRegion moved to src/op/utils.{h,cc} /** * @brief Construct a Gemm operator from serialized TL arguments and a buffer diff --git a/src/op/reduce.cc b/src/op/reduce.cc index 
b6dbe865..c326f5ac 100644 --- a/src/op/reduce.cc +++ b/src/op/reduce.cc @@ -17,105 +17,16 @@ #include "region.h" #include "tir/transforms/ir_utils.h" #include "tvm/tir/stmt.h" +#include "utils.h" namespace tvm { namespace tl { using namespace tir; -// Normalize an argument (BufferRegion/BufferLoad/tl.region) -// to BufferRegion so Reduce can uniformly consume regions. -static BufferRegion NormalizeToBufferRegion(const PrimExpr &arg, - const BufferMap &vmap) { - // Case 1: Already a BufferRegion - if (arg->IsInstance()) { - return Downcast(arg); - } - - // Case 2: BufferLoad — convert indices to ranges (Ramp -> lanes, else - // extent=1) - if (const auto *load = arg.as()) { - Array ranges; - for (const PrimExpr &index : load->indices) { - if (const auto *ramp = index.as()) { - ICHECK(ramp->stride.as()) << "Ramp stride must be IntImm"; - ICHECK_EQ(ramp->stride.as()->value, 1) - << "Only stride-1 Ramp is supported in region conversion"; - ICHECK(ramp->lanes.as()) - << "Scalable vector lanes not supported in region conversion"; - ranges.push_back(Range::FromMinExtent(ramp->base, ramp->lanes)); - } else { - ranges.push_back(Range::FromMinExtent(index, 1)); - } - } - return BufferRegion(load->buffer, ranges); - } - - // Case 3: Call nodes (only tl.region) - if (const auto *call = arg.as()) { - // tl.region(...) — reconstruct via RegionOp - if (call->op.same_as(RegionOp::Get())) { - RegionOp region(call->args, vmap); - return BufferRegion(region->GetBuffer(), region->GetRanges()); - } - // builtin.tvm_access_ptr(...) 
— map var to Buffer and take full region - if (call->op.same_as(builtin::tvm_access_ptr())) { - Var var = Downcast(call->args[1]); - Buffer buf = vmap[var]; - Array ranges; - for (PrimExpr extent : buf->shape) { - ranges.push_back(Range(IntImm(extent->dtype, 0), extent)); - } - return BufferRegion(buf, ranges); - } - } - - LOG(FATAL) << "Unsupported argument for BufferRegion in reduce: " << arg; - throw; // Unreachable -} - -// Build a tvm_access_ptr(handle) to the start of the 2D tile within a -// BufferRegion. Offset is computed from all but the last two dimensions; extent -// is the product of the last two extents. rw_mask: 1=read, 2=write, -// 3=readwrite. -static PrimExpr MakeAccessPtrFromRegion(const BufferRegion ®ion, - int rw_mask) { - Buffer buf = region->buffer; - int ndim = static_cast(buf->shape.size()); - ICHECK(ndim == 1 || ndim == 2) << "Cumsum expects buffers with 1 or 2 dims"; - - PrimExpr offset, extent; - if (ndim == 1) { - // Simple 1D region: offset and extent come from the single axis. 
- auto axis = region->region[0]; - offset = axis->min; - extent = axis->extent; - } else { - // Compute row-major strides for ndim >= 2 - std::vector strides(ndim); - PrimExpr one = make_const(buf->shape[0].dtype(), 1); - PrimExpr cur = one; - for (int i = ndim - 1; i >= 0; --i) { - strides[i] = cur; - cur = cur * buf->shape[i]; - } - // Offset: sum_{i in [0..ndim-3]} min_i * stride_i - offset = make_const(buf->shape[0].dtype(), 0); - for (int i = 0; i < ndim - 2; ++i) { - offset = offset + region->region[i]->min * strides[i]; - } +// NormalizeToBufferRegion moved to src/op/utils.{h,cc} - // Extent: last two extents product (elements) - extent = - region->region[ndim - 2]->extent * region->region[ndim - 1]->extent; - } - - // ptype and return handle - PrimExpr ptype = tir::TypeAnnotation(buf->dtype); - Array acc_args{ptype, buf->data, offset, extent, - IntImm(DataType::Int(32), rw_mask)}; - return Call(DataType::Handle(), builtin::tvm_access_ptr(), acc_args); -} +// MakeAccessPtrFromRegion moved to src/op/utils.{h,cc} ReduceOp::ReduceOp(Array args, BufferMap vmap) { ObjectPtr node = tvm::ffi::make_object(); diff --git a/src/op/utils.cc b/src/op/utils.cc new file mode 100644 index 00000000..59960b57 --- /dev/null +++ b/src/op/utils.cc @@ -0,0 +1,105 @@ +/*! + * \file tl/op/utils.cc + * \brief Common utilities implementation for TL ops. 
+ */ + +#include "utils.h" + +#include + +namespace tvm { +namespace tl { + +using namespace tir; + +BufferRegion NormalizeToBufferRegion(const PrimExpr &arg, + const BufferMap &vmap) { + // Case 1: Already a BufferRegion + if (arg->IsInstance()) { + return Downcast(arg); + } + + // Case 2: BufferLoad — convert indices to ranges (Ramp -> lanes, else + // extent=1) + if (const auto *load = arg.as()) { + Array ranges; + for (const PrimExpr &index : load->indices) { + if (const auto *ramp = index.as()) { + ICHECK(ramp->stride.as()) << "Ramp stride must be IntImm"; + ICHECK_EQ(ramp->stride.as()->value, 1) + << "Only stride-1 Ramp is supported in region conversion"; + ICHECK(ramp->lanes.as()) + << "Scalable vector lanes not supported in region conversion"; + ranges.push_back(Range::FromMinExtent(ramp->base, ramp->lanes)); + } else { + ranges.push_back(Range::FromMinExtent(index, 1)); + } + } + return BufferRegion(load->buffer, ranges); + } + + // Case 3: Call nodes + if (const auto *call = arg.as()) { + // tl.region(...) — reconstruct via RegionOp + if (call->op.same_as(RegionOp::Get())) { + RegionOp region(call->args, vmap); + return BufferRegion(region->GetBuffer(), region->GetRanges()); + } + // builtin.tvm_access_ptr(...) 
— map var to Buffer and take full region + if (call->op.same_as(builtin::tvm_access_ptr())) { + Var var = Downcast(call->args[1]); + Buffer buf = vmap.at(var); + Array ranges; + for (PrimExpr extent : buf->shape) { + ranges.push_back(Range(IntImm(extent->dtype, 0), extent)); + } + return BufferRegion(buf, ranges); + } + } + + LOG(FATAL) << "Unsupported argument for BufferRegion: " << arg; + throw; // Unreachable +} + +PrimExpr MakeAccessPtrFromRegion(const BufferRegion ®ion, int rw_mask, + bool require_2d) { + Buffer buf = region->buffer; + int ndim = static_cast(buf->shape.size()); + if (require_2d) { + ICHECK(ndim >= 2) << "Expect buffers with at least 2 dims"; + } + + PrimExpr offset, extent; + if (ndim == 1) { + // 1D: straightforward + auto axis = region->region[0]; + offset = axis->min; + extent = axis->extent; + } else { + // Compute row-major strides + std::vector strides(ndim); + PrimExpr one = make_const(buf->shape[0].dtype(), 1); + PrimExpr cur = one; + for (int i = ndim - 1; i >= 0; --i) { + strides[i] = cur; + cur = cur * buf->shape[i]; + } + // Offset: sum_{i in [0..ndim-3]} min_i * stride_i + offset = make_const(buf->shape[0].dtype(), 0); + for (int i = 0; i < ndim - 2; ++i) { + offset = offset + region->region[i]->min * strides[i]; + } + // Extent: last two extents product (elements) + extent = + region->region[ndim - 2]->extent * region->region[ndim - 1]->extent; + } + + // ptype and return handle + PrimExpr ptype = tir::TypeAnnotation(buf->dtype); + Array acc_args{ptype, buf->data, offset, extent, + IntImm(DataType::Int(32), rw_mask)}; + return Call(DataType::Handle(), builtin::tvm_access_ptr(), acc_args); +} + +} // namespace tl +} // namespace tvm diff --git a/src/op/utils.h b/src/op/utils.h new file mode 100644 index 00000000..9e7880ac --- /dev/null +++ b/src/op/utils.h @@ -0,0 +1,35 @@ +/*! + * \file tl/op/utils.h + * \brief Common utilities for TL ops. 
+ */ + +#ifndef TVM_TL_OP_UTILS_H_ +#define TVM_TL_OP_UTILS_H_ + +#include "./operator.h" +#include "region.h" +#include +#include + +namespace tvm { +namespace tl { + +using namespace tir; + +// Normalize an argument (BufferRegion/BufferLoad/tl.region/tvm_access_ptr) +// to BufferRegion so ops can uniformly consume regions. +TVM_DLL BufferRegion NormalizeToBufferRegion(const PrimExpr &arg, + const BufferMap &vmap); + +// Build a tvm_access_ptr(handle) from a BufferRegion. +// - If `require_2d` is true, checks buffer ndim >= 2. +// - For 1D regions (when allowed), offset=min, extent=extent. +// - For ndim >= 2, offset sums all but last two dims using row-major strides, +// extent is product of the last two extents. +TVM_DLL PrimExpr MakeAccessPtrFromRegion(const BufferRegion ®ion, + int rw_mask, bool require_2d = false); + +} // namespace tl +} // namespace tvm + +#endif // TVM_TL_OP_UTILS_H_ -- GitLab From 2ae4f1b7877a828da7d01cf88a2a45ad37850bfd Mon Sep 17 00:00:00 2001 From: Kuris <227995639+kurisu6912@users.noreply.github.com> Date: Tue, 25 Nov 2025 14:07:52 +0800 Subject: [PATCH 046/139] [Fix] Fix bug copying from or to local buffer (#1304) (#1324) * [Fix] fix copy from or to local buffer (#1304) * fix lint error * minor fix testing script --- src/op/copy.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/op/copy.cc b/src/op/copy.cc index 2584abce..82c903f8 100644 --- a/src/op/copy.cc +++ b/src/op/copy.cc @@ -851,8 +851,13 @@ Stmt CopyNode::LowerNormalCopy(const LowerArgs &T, For vectorized_thread_loop; auto par_op = ParallelOp(transformed_loop); - if (is_cpu_target) { - vectorized_thread_loop = VectorizeLoop(transformed_loop, analyzer); + if (is_cpu_target || dst.scope() == "local" || src.scope() == "local") { + if (src.scope() == "local" && dst.scope() != "local") { + LOG(WARNING) << "Copy from local buffer `" << src->name << "` to " + << dst.scope() << " buffer `" << dst->name + << "` may cause conflicted write."; + } + 
vectorized_thread_loop = VectorizeLoop(transformed_loop); } else { std::vector levels = {InferLevel::kCommon, InferLevel::kStrict, InferLevel::kFree}; -- GitLab From e2b10c580b32cd31f384917d0ce31b7610f4e5e4 Mon Sep 17 00:00:00 2001 From: Chaofan Lin Date: Tue, 25 Nov 2025 20:22:15 +0800 Subject: [PATCH 047/139] [Language][UX] Semantic check for parallel fragment access (#1338) --- src/transform/layout_inference.cc | 8 +- .../test_tilelang_fragment_loop_checker.py | 162 ++++++++++++++++++ .../test_tilelang_nested_loop_checker.py} | 0 tilelang/analysis/__init__.py | 1 + tilelang/analysis/fragment_loop_checker.py | 100 +++++++++++ tilelang/analysis/nested_loop_checker.py | 6 +- tilelang/engine/phase.py | 3 + 7 files changed, 277 insertions(+), 3 deletions(-) create mode 100644 testing/python/analysis/test_tilelang_fragment_loop_checker.py rename testing/python/{language/test_tilelang_language_nested_loop.py => analysis/test_tilelang_nested_loop_checker.py} (100%) create mode 100644 tilelang/analysis/fragment_loop_checker.py diff --git a/src/transform/layout_inference.cc b/src/transform/layout_inference.cc index be98b284..873f70d0 100644 --- a/src/transform/layout_inference.cc +++ b/src/transform/layout_inference.cc @@ -821,7 +821,13 @@ private: int64_t frag_reg_num = 1; for (auto i : frag.value()->OutputShape()) { auto pci = as_const_int(i); - ICHECK(pci != nullptr); + ICHECK(pci != nullptr) + << "Can not use non-constant range to " + "iterate over a fragment/local " + "buffer. Non-constant shape expr is: " + << i + << ". 
This is possibly because you use symbolic shape when " + "accessing a fragment/local buffer."; frag_reg_num *= *pci; } reg_num += frag_reg_num; diff --git a/testing/python/analysis/test_tilelang_fragment_loop_checker.py b/testing/python/analysis/test_tilelang_fragment_loop_checker.py new file mode 100644 index 00000000..9073aebc --- /dev/null +++ b/testing/python/analysis/test_tilelang_fragment_loop_checker.py @@ -0,0 +1,162 @@ +import tilelang +import tilelang.language as T +import pytest + + +@tilelang.jit +def simple_invalid_loop(dtype: str = "bfloat16", + accum_dtype: str = "float32", + num_threads: int = 128): + A = T.dynamic("A") + + @T.prim_func + def main( + data: T.Tensor((128, A), dtype), # type: ignore + ): + with T.Kernel(128, threads=num_threads) as (tid,): + data_frag = T.alloc_fragment([128], accum_dtype) + + for i in T.Parallel(128): + if i < A: + data_frag[i] = data[tid, i] + + for i in T.Parallel(A): + data_frag[i] = 0 + + return main + + +@tilelang.jit +def nested_invalid_loop(dtype: str = "bfloat16", + accum_dtype: str = "float32", + num_threads: int = 128): + A = T.dynamic("A") + + @T.prim_func + def main( + data: T.Tensor((128, A), dtype), # type: ignore + ): + with T.Kernel(128, threads=num_threads) as (tid,): + data_frag = T.alloc_fragment([128], accum_dtype) + + for i in T.Parallel(128): + if i < A: + data_frag[i] = data[tid, i] + + for i in T.Parallel(A // 64): + for j in T.Parallel(64): + data_frag[i * 64 + j] = 0 + + return main + + +@tilelang.jit +def invalid_loop_with_complex_dataflow(dtype: str = "bfloat16", + accum_dtype: str = "float32", + num_threads: int = 128): + A = T.dynamic("A") + + @T.prim_func + def main( + data: T.Tensor((128, A), dtype), # type: ignore + ): + with T.Kernel(128, threads=num_threads) as (tid,): + data_frag = T.alloc_fragment([128], accum_dtype) + + for i in T.Parallel(128): + if i < A: + data_frag[i] = data[tid, i] + + for i in T.Parallel(A): + data_frag[64 // 2 + i % 64] = 0 + + return main + + 
+@tilelang.jit +def valid_loop_not_use_loop_var(dtype: str = "bfloat16", + accum_dtype: str = "float32", + num_threads: int = 128): + A = T.dynamic("A") + + @T.prim_func + def main( + data: T.Tensor((128, A), dtype), # type: ignore + ): + with T.Kernel(128, threads=num_threads) as (tid,): + data_frag = T.alloc_fragment([128], accum_dtype) + + for i in T.Parallel(128): + if i < A: + data_frag[i] = data[tid, i] + + for i in T.Parallel(A): # noqa: B007 + for j in T.Parallel(64): + data_frag[j] = 0 # This is valid because we don't use i + + return main + + +@tilelang.jit +def valid_loop_not_frag(dtype: str = "bfloat16", + accum_dtype: str = "float32", + num_threads: int = 128): + A = T.dynamic("A") + + @T.prim_func + def main( + data: T.Tensor((128, A), dtype), # type: ignore + ): + with T.Kernel(128, threads=num_threads) as (tid,): + data_shared = T.alloc_shared([128], accum_dtype) + + for i in T.Parallel(128): + if i < A: + data_shared[i] = data[tid, i] + + for i in T.Parallel(A): + data_shared[i] = 0 # Valid because this is shared memory + + return main + + +@tilelang.jit +def valid_loop_serial(dtype: str = "bfloat16", + accum_dtype: str = "float32", + num_threads: int = 128): + A = T.dynamic("A") + + @T.prim_func + def main( + data: T.Tensor((128, A), dtype), # type: ignore + ): + with T.Kernel(128, threads=num_threads) as (tid,): + data_shared = T.alloc_shared([128], accum_dtype) + + for i in T.Parallel(128): + if i < A: + data_shared[i] = data[tid, i] + + for i in T.serial(A): + data_shared[i] = 0 # Valid because this is serial + + return main + + +def test_invalid_loop(): + with pytest.raises(ValueError): + simple_invalid_loop() + with pytest.raises(ValueError): + nested_invalid_loop() + with pytest.raises(ValueError): + invalid_loop_with_complex_dataflow() + + +def test_valid_loop(): + valid_loop_not_use_loop_var() + valid_loop_not_frag() + valid_loop_serial() + + +if __name__ == "__main__": + tilelang.testing.main() diff --git 
a/testing/python/language/test_tilelang_language_nested_loop.py b/testing/python/analysis/test_tilelang_nested_loop_checker.py similarity index 100% rename from testing/python/language/test_tilelang_language_nested_loop.py rename to testing/python/analysis/test_tilelang_nested_loop_checker.py diff --git a/tilelang/analysis/__init__.py b/tilelang/analysis/__init__.py index 6e5ee5d6..33ccded6 100644 --- a/tilelang/analysis/__init__.py +++ b/tilelang/analysis/__init__.py @@ -2,3 +2,4 @@ from .ast_printer import ASTPrinter # noqa: F401 from .nested_loop_checker import NestedLoopChecker # noqa: F401 +from .fragment_loop_checker import FragmentLoopChecker # noqa: F401 diff --git a/tilelang/analysis/fragment_loop_checker.py b/tilelang/analysis/fragment_loop_checker.py new file mode 100644 index 00000000..3186b23e --- /dev/null +++ b/tilelang/analysis/fragment_loop_checker.py @@ -0,0 +1,100 @@ +from __future__ import annotations +from tvm import tir +from tvm.tir import (PyStmtExprVisitor, BufferStore, For, Var, PrimFunc, BufferLoad, IntImm) +from tvm.tir.transform import prim_func_pass +from tvm.tir.stmt_functor import post_order_visit + + +@tir.functor.visitor +class _LoopVarUseAnalyzer(PyStmtExprVisitor): + """Analyze whether a loop variable is used in the given expr.""" + + def __init__(self, var: Var) -> None: + super().__init__() + self.var = var + self.used = False + + def visit_var_(self, op: Var) -> None: + if op == self.var: + self.used = True + # Don't recursively visit children to avoid infinite recursion + + +def collect_local_buffer_accesses(statement) -> list[BufferLoad | BufferStore]: + """ + Collect local buffer accesses in the loop body. + + Args: + statement: The TIR statement to analyze + + Returns: + Tuple of buffer accesses in the loop body. 
+ """ + + buffer_accesses = [] + + def visit_buffer_access(node): + if isinstance(node, (BufferLoad, BufferStore)) and node.buffer.scope().startswith("local"): + buffer_accesses.append(node) + + post_order_visit(statement, visit_buffer_access) + + return buffer_accesses + + +@tir.functor.visitor +class _FragmentLoopCheckVisitor(PyStmtExprVisitor): + + def __init__(self) -> None: + super().__init__() + + def visit_for_(self, op: For) -> None: + if op.kind == tir.ForKind.PARALLEL: + # Fuse consecutive parallel loops + # Other nested cases are all invalid in TileLang. + loops = [op] + child = op.body + while isinstance(child, For) and child.kind == tir.ForKind.PARALLEL: + loops.append(child) + child = child.body + + loops_with_symbolic_ranges = [] + for loop in loops: + if not (isinstance(loop.min, IntImm) and isinstance(loop.extent, IntImm)): + loops_with_symbolic_ranges.append(loop) + + if len(loops_with_symbolic_ranges) > 0: + buffer_accesses = collect_local_buffer_accesses(child) + for loop in loops_with_symbolic_ranges: + for buffer_access in buffer_accesses: + indices = buffer_access.indices + analyzer = _LoopVarUseAnalyzer(loop.loop_var) + for index in indices: + analyzer.visit_expr(index) + if analyzer.used: + raise ValueError( + "[Tilelang Semantic Check] " + f"Loop variable {loop.loop_var} in a T.Parallel loop with symbolic range (min={loop.min}, extent={loop.extent}) is used to index " + "a local/fragment buffer, which is not allowed in Tilelang.") + + return + + self.visit_stmt(op.body) + + +def FragmentLoopChecker(): + """ + When using T.Parallel over a local/fragment buffer, there are several restrictions: + to ensure that the parallelization is valid. + + 1. The range of loop can not be symbolic. 
+ + Returns: + A prim_func_pass that applies the transformation + """ + + def pass_fn(func: PrimFunc, mod, ctx): + _FragmentLoopCheckVisitor().visit_stmt(func.body) + return func + + return prim_func_pass(pass_fn, opt_level=0) diff --git a/tilelang/analysis/nested_loop_checker.py b/tilelang/analysis/nested_loop_checker.py index 4b9741c3..7a0d94da 100644 --- a/tilelang/analysis/nested_loop_checker.py +++ b/tilelang/analysis/nested_loop_checker.py @@ -35,7 +35,8 @@ class _NestedLoopCheckVisitor(PyStmtExprVisitor): # Otherwise if self.in_parallel_context: - raise ValueError("Nested parallel loops are not allowed. " + raise ValueError("[Tilelang Semantic Check] " + "Nested parallel loops are not allowed. " "Please check your loop structure.") self.in_parallel_context = True self.visit_stmt(child) @@ -43,7 +44,8 @@ class _NestedLoopCheckVisitor(PyStmtExprVisitor): return elif is_pipelined_for(op): if self.in_parallel_context: - raise ValueError("Pipelined loop cannot be nested inside a parallel loop. " + raise ValueError("[Tilelang Semantic Check] " + "Pipelined loop cannot be nested inside a parallel loop. " "Please check your loop structure.") self.visit_stmt(op.body) diff --git a/tilelang/engine/phase.py b/tilelang/engine/phase.py index f686ba1f..17d6e4aa 100644 --- a/tilelang/engine/phase.py +++ b/tilelang/engine/phase.py @@ -80,6 +80,9 @@ def PreLowerSemanticCheck(mod: IRModule) -> None: # Check if there are any invalid nested loops. tilelang.analysis.NestedLoopChecker()(mod) + # Check if there are any invalid symbolic T.Parallel + fragment access. 
+ tilelang.analysis.FragmentLoopChecker()(mod) + def LowerAndLegalize(mod: IRModule, target: Target) -> IRModule: # Bind the target device information to the module -- GitLab From f810f9767a53b140557daf5486e326c723b40a6a Mon Sep 17 00:00:00 2001 From: LJC00118 <77378439+LJC00118@users.noreply.github.com> Date: Wed, 26 Nov 2025 12:57:48 +0800 Subject: [PATCH 048/139] Add unit tests for T.assume (#1341) * Add test for T.assume * Add unit test for T.assume * Add unit test for T.assume * Add unit tests for T.assume * Remove debug print for kernel source Remove print statement for kernel source in tests. * Update test_tilelang_language_assume.py --------- Co-authored-by: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> --- .../language/test_tilelang_language_assume.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 testing/python/language/test_tilelang_language_assume.py diff --git a/testing/python/language/test_tilelang_language_assume.py b/testing/python/language/test_tilelang_language_assume.py new file mode 100644 index 00000000..9c75a5ac --- /dev/null +++ b/testing/python/language/test_tilelang_language_assume.py @@ -0,0 +1,89 @@ +import tilelang +import tilelang.language as T +import tilelang.testing + + +def test_assume_remove_boundary_check(): + + @tilelang.jit + def kernel_with_assume(): + N = T.dynamic('N') + + @T.prim_func + def main(A: T.Tensor((N,), "float32"), l: T.int32, r: T.int32): + with T.Kernel(1, threads=32) as _: + for i in T.serial(r - l + 1): + T.assume(l + i >= 0 and l + i < N) + A[l + i] = 0 + + return main + + jit_kernel = kernel_with_assume() + source = jit_kernel.get_kernel_source() + + assert ("if (" not in source) + + +def test_assume_enable_vectorization(): + + @tilelang.jit + def kernel_vectorize(M): + N = T.dynamic('N') + vectorize_size = 4 + + @T.prim_func + def main( + A: T.Tensor((M, N), "float32"), + B: T.Tensor((M, N), "float32"), + ): + with T.Kernel(1, threads=32) as _: + tid = 
T.get_thread_binding() + + base_idx = tid * 4 + T.assume(N % vectorize_size == 0) + + for i in T.vectorized(vectorize_size): + T.assume(base_idx + i < N) + B[tid, base_idx + i] = A[tid, base_idx + i] + + return main + + jit_kernel = kernel_vectorize(128) + source = jit_kernel.get_kernel_source() + + assert ("float4" in source) and ("if (" not in source) + + +def test_assume_complex_indexing(): + + @tilelang.jit + def kernel_complex(): + M = T.dynamic('M') + N = T.dynamic('N') + + @T.prim_func + def main( + A: T.Tensor((M, N), "float32"), + B: T.Tensor((M, N), "float32"), + ): + with T.Kernel(1, threads=32) as _: + tid = T.get_thread_binding() + for j in T.serial(N): + i_src = T.min(j + 233, tid + 2) + j_src = j * T.ceildiv(j, i_src) * j - 1 + + T.assume(i_src >= 0 and i_src < M) + T.assume(j_src >= 0 and j_src < N) + + B[tid, j] = A[i_src, j_src] + + return main + + jit_kernel = kernel_complex() + source = jit_kernel.get_kernel_source() + + assert ("if (" not in source) + + +if __name__ == '__main__': + tilelang.testing.main() -- GitLab From fac0400680aa267efe01c663d0b92544c22471b5 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Wed, 26 Nov 2025 14:02:09 +0800 Subject: [PATCH 049/139] [Feat] Extend LegalizeNegativeIndex to support buffer store stmts (#1339) This commit enhances the LegalizeNegativeIndex transformation pass to handle both buffer load and store operations with negative indices and adds some test cases. 
--- src/support/ffi_aliases.h | 1 + src/transform/legalize_negative_index.cc | 214 +++++------ ...elang_transform_legalize_negative_index.py | 342 ++++++++++++++++++ 3 files changed, 453 insertions(+), 104 deletions(-) create mode 100644 testing/python/transform/test_tilelang_transform_legalize_negative_index.py diff --git a/src/support/ffi_aliases.h b/src/support/ffi_aliases.h index cbc6fb02..7dbe0b39 100644 --- a/src/support/ffi_aliases.h +++ b/src/support/ffi_aliases.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/src/transform/legalize_negative_index.cc b/src/transform/legalize_negative_index.cc index b502a6fb..f0df555e 100644 --- a/src/transform/legalize_negative_index.cc +++ b/src/transform/legalize_negative_index.cc @@ -1,6 +1,6 @@ /*! * \file legalize_negative_index.cc - * \brief Legalize negative indices in buffer load expressions. + * \brief Legalize negative indices in buffer load/store expressions. */ #include @@ -10,6 +10,7 @@ #include #include +#include #include #include "arith/ir_mutator_with_analyzer.h" @@ -23,47 +24,42 @@ using arith::IRVisitorWithAnalyzer; enum class IndexSignState { kNonNegative, kNegative, kUnknown }; +using BufferAccessVariant = + std::variant; +using LoadStore2StateMap = + std::unordered_map>; + class NegativeIndexAnalyzer : public IRVisitorWithAnalyzer { public: - explicit NegativeIndexAnalyzer( - std::unordered_map> - *result) + explicit NegativeIndexAnalyzer(LoadStore2StateMap *result) : result_(result) {} - void VisitExpr_(const BufferLoadNode *op) final { - auto load = tvm::ffi::GetRef(op); +private: + std::vector ProcessIdx(const ffi::Array &indices, + ffi::String buffer_name) { std::vector states; - states.reserve(op->indices.size()); - bool needs_record = false; + states.reserve(indices.size()); - for (size_t i = 0; i < op->indices.size(); ++i) { - PrimExpr simplified = analyzer_.Simplify(op->indices[i]); + for (size_t i = 0; i < indices.size(); ++i) { + PrimExpr 
simplified = analyzer_.Simplify(indices[i]); + IndexSignState state = IndexSignState::kUnknown; // Handle scalar indices with the standard analyzer if (simplified.dtype().lanes() == 1) { - if (analyzer_.CanProve(simplified >= 0)) { - states.push_back(IndexSignState::kNonNegative); - continue; - } - if (analyzer_.CanProve(simplified < 0)) { - states.push_back(IndexSignState::kNegative); - needs_record = true; - continue; - } - states.push_back(IndexSignState::kUnknown); - needs_record = true; - DLOG(WARNING) - << "LegalizeNegativeIndex: cannot prove non-negative index " - << simplified << " for buffer " << load->buffer->name << " (axis " - << i << ")."; - continue; + if (analyzer_.CanProve(simplified >= 0)) + state = IndexSignState::kNonNegative; + else if (analyzer_.CanProve(simplified < 0)) + state = IndexSignState::kNegative; + else + DLOG(WARNING) + << "LegalizeNegativeIndex: cannot prove non-negative index " + << simplified << " for buffer " << buffer_name << " (axis " << i + << ", index " + indices[i]->Script() + ")."; } - // Vector indices: try to reason about non-negativity/negativity // Common patterns are Ramp(base, stride, lanes) and Broadcast(value, // lanes). 
- IndexSignState vec_state = IndexSignState::kUnknown; - if (const auto *ramp = simplified.as()) { + else if (const auto *ramp = simplified.as()) { // Compute a safe lower/upper bound for the vector lanes // lower_bound = base_min + min(0, stride_min) * (lanes - 1) // upper_bound = base_max + max(0, stride_max) * (lanes - 1) @@ -85,118 +81,129 @@ public: if (s_max > 0) upper += s_max * (lanes - 1); - if (lower >= 0) { - vec_state = IndexSignState::kNonNegative; - } else if (upper < 0) { - vec_state = IndexSignState::kNegative; - } else { - vec_state = IndexSignState::kUnknown; - } - } else if (const auto *bc = simplified.as()) { - auto v = analyzer_.Simplify(bc->value); - if (analyzer_.CanProve(v >= 0)) { - vec_state = IndexSignState::kNonNegative; - } else if (analyzer_.CanProve(v < 0)) { - vec_state = IndexSignState::kNegative; - } else { + if (lower >= 0) + state = IndexSignState::kNonNegative; + else if (upper < 0) + state = IndexSignState::kNegative; + else + DLOG(WARNING) + << "LegalizeNegativeIndex: cannot prove non-negative index " + << simplified << " for buffer " << buffer_name << " (axis " << i + << ", index " + indices[i]->Script() + ")."; + } else if (const auto *broadcast = simplified.as()) { + auto v = analyzer_.Simplify(broadcast->value); + if (analyzer_.CanProve(v >= 0)) + state = IndexSignState::kNonNegative; + else if (analyzer_.CanProve(v < 0)) + state = IndexSignState::kNegative; + else { // Try const bound if proof unavailable auto vb = analyzer_.const_int_bound(v); - if (vb->min_value >= 0) { - vec_state = IndexSignState::kNonNegative; - } else if (vb->max_value < 0) { - vec_state = IndexSignState::kNegative; - } else { - vec_state = IndexSignState::kUnknown; - } + if (vb->min_value >= 0) + state = IndexSignState::kNonNegative; + else if (vb->max_value < 0) + state = IndexSignState::kNegative; + else + DLOG(WARNING) + << "LegalizeNegativeIndex: cannot prove non-negative index " + << simplified << " for buffer " << buffer_name << " (axis " << 
i + << ", index " + indices[i]->Script() + ")."; } } + states.push_back(state); + } - if (vec_state == IndexSignState::kNonNegative) { - states.push_back(IndexSignState::kNonNegative); - continue; - } - if (vec_state == IndexSignState::kNegative) { - states.push_back(IndexSignState::kNegative); - needs_record = true; - continue; - } + return std::move(states); + } - states.push_back(IndexSignState::kUnknown); - needs_record = true; - DLOG(WARNING) << "LegalizeNegativeIndex: cannot prove non-negative index " - << simplified << " for buffer " << load->buffer->name - << " (axis " << i << ")."; - } + bool NeedRecord(const std::vector &states) { + return std::any_of(states.begin(), states.end(), + [](const IndexSignState &state) { + return state == IndexSignState::kUnknown || + state == IndexSignState::kNegative; + }); + } + + void VisitExpr_(const BufferLoadNode *op) final { + std::vector states = + ProcessIdx(op->indices, op->buffer->name); - if (needs_record) { + if (NeedRecord(states)) (*result_)[op] = std::move(states); - } IRVisitorWithAnalyzer::VisitExpr_(op); } + void VisitStmt_(const BufferStoreNode *op) final { + std::vector states = + ProcessIdx(op->indices, op->buffer->name); + + if (NeedRecord(states)) + (*result_)[op] = std::move(states); + + IRVisitorWithAnalyzer::VisitStmt_(op); + } + private: - std::unordered_map> - *result_; + LoadStore2StateMap *result_; }; class NegativeIndexRewriter : public arith::IRMutatorWithAnalyzer { public: - static PrimFunc - Apply(PrimFunc func, - const std::unordered_map> &states) { + static PrimFunc Apply(PrimFunc func, const LoadStore2StateMap &states) { arith::Analyzer analyzer; NegativeIndexRewriter rewriter(&analyzer, states); - if (!func->body.defined()) { - return func; - } PrimFuncNode *func_node = func.CopyOnWrite(); func_node->body = rewriter.VisitStmt(func_node->body); return func; } private: - NegativeIndexRewriter( - arith::Analyzer *analyzer, - const std::unordered_map> &states) + 
NegativeIndexRewriter(arith::Analyzer *analyzer, + const LoadStore2StateMap &states) : arith::IRMutatorWithAnalyzer(analyzer), states_(states) {} + ffi::Array UpdateIdx(const ffi::Array &indices, + const ffi::Array &buffer_shape, + const std::vector &state_vec) { + ICHECK_EQ(state_vec.size(), indices.size()) + << "State vector size mismatch for buffer load/store indices (" + << indices << ")"; + ffi::Array new_indices = indices; + for (size_t i = 0; i < indices.size(); ++i) { + if (state_vec[i] != IndexSignState::kNegative) + continue; + new_indices.Set(i, analyzer_->Simplify(buffer_shape[i] + indices[i])); + } + return new_indices; + } + PrimExpr VisitExpr_(const BufferLoadNode *op) final { BufferLoad load = Downcast(arith::IRMutatorWithAnalyzer::VisitExpr_(op)); auto it = states_.find(op); - if (it == states_.end()) { + if (it == states_.end()) return load; - } - auto indices = load->indices; - bool changed = false; - - const auto &state_vector = it->second; - ICHECK_EQ(state_vector.size(), indices.size()) - << "State vector size mismatch for buffer load " << load->buffer->name; + auto indices = UpdateIdx(load->indices, load->buffer->shape, it->second); + return BufferLoad(load->buffer, indices, load->predicate); + } - for (size_t i = 0; i < indices.size(); ++i) { - if (state_vector[i] != IndexSignState::kNegative) { - continue; - } - PrimExpr extent = load->buffer->shape[i]; - indices.Set(i, analyzer_->Simplify(extent + indices[i])); - changed = true; - } + Stmt VisitStmt_(const BufferStoreNode *op) final { + BufferStore store = + Downcast(arith::IRMutatorWithAnalyzer::VisitStmt_(op)); - if (!changed) { - return load; - } + auto it = states_.find(op); + if (it == states_.end()) + return store; - return BufferLoad(load->buffer, indices); + auto indices = UpdateIdx(store->indices, store->buffer->shape, it->second); + return BufferStore(store->buffer, store->value, indices, store->predicate); } - const std::unordered_map> - &states_; +private: + const 
LoadStore2StateMap &states_; }; PrimFunc LegalizeNegativeIndex(PrimFunc func) { @@ -204,8 +211,7 @@ PrimFunc LegalizeNegativeIndex(PrimFunc func) { return func; } - std::unordered_map> - states; + LoadStore2StateMap states; NegativeIndexAnalyzer analyzer(&states); analyzer(func->body); if (states.empty()) { diff --git a/testing/python/transform/test_tilelang_transform_legalize_negative_index.py b/testing/python/transform/test_tilelang_transform_legalize_negative_index.py new file mode 100644 index 00000000..c5dd065a --- /dev/null +++ b/testing/python/transform/test_tilelang_transform_legalize_negative_index.py @@ -0,0 +1,342 @@ +from tilelang import tvm as tvm +import tilelang as tl +import tilelang.language as T +import tilelang.testing + + +def _check(original, expected): + """Helper function to verify structural equality after transformations""" + func = original + mod = tvm.IRModule.from_expr(func.with_attr("global_symbol", "main")) + mod = tl.transform.LegalizeNegativeIndex()(mod) + expected = tvm.IRModule.from_expr(expected.with_attr("global_symbol", "main")) + tvm.ir.assert_structural_equal(mod["main"], expected["main"], True) + + +def test_buffer_load_negative_index_legalized(): + """ + Test that negative indices are legalized by adding buffer extent. + """ + + @T.prim_func + def before(A: T.Tensor((1024,), "float32")): + value = A[-1] + B = T.alloc_buffer((1,), "float32") + B[0] = value + + @T.prim_func + def after(A: T.Tensor((1024,), "float32")): + value = A[1023] # A[-1] becomes A[1023] + B = T.alloc_buffer((1,), "float32") + B[0] = value + + _check(before, after) + + +def test_buffer_load_mixed_negative_positive_indices(): + """ + Test mixed negative and positive indices - only negative ones are legalized. 
+ """ + + @T.prim_func + def before(A: T.Tensor((1024, 512), "float32")): + value = A[-1, 10] + B = T.alloc_buffer((1,), "float32") + B[0] = value + + @T.prim_func + def after(A: T.Tensor((1024, 512), "float32")): + value = A[1023, 10] # A[-1, 10] becomes A[1023, 10] + B = T.alloc_buffer((1,), "float32") + B[0] = value + + _check(before, after) + + +def test_buffer_load_multiple_negative_indices(): + """ + Test multiple negative indices in different dimensions. + """ + + @T.prim_func + def before(A: T.Tensor((1024, 512, 256), "float32")): + value = A[-1, -2, -3] + B = T.alloc_buffer((1,), "float32") + B[0] = value + + @T.prim_func + def after(A: T.Tensor((1024, 512, 256), "float32")): + value = A[1023, 510, 253] # -1+1024=1023, -2+512=510, -3+256=253 + B = T.alloc_buffer((1,), "float32") + B[0] = value + + _check(before, after) + + +def test_buffer_load_negative_index_in_expression(): + """ + Test negative index as part of a larger expression. + """ + + @T.prim_func + def before(A: T.Tensor((1024,), "float32")): + B = T.alloc_buffer((1024,), "float32") + for i in T.serial(1, 1024): + value = A[-i] + B[-i] = value + + @T.prim_func + def after(A: T.Tensor((1024,), "float32")): + B = T.alloc_buffer((1024,), "float32") + for i in T.serial(1, 1024): + value = A[1024 - i] + B[1024 - i] = value + + _check(before, after) + + +def test_buffer_load_non_negative_index_unchanged(): + """ + Test that non-negative indices remain unchanged. + """ + + @T.prim_func + def before(A: T.Tensor((1024,), "float32")): + value = A[0] + B = T.alloc_buffer((1,), "float32") + B[0] = value + + @T.prim_func + def after(A: T.Tensor((1024,), "float32")): + # No changes expected for non-negative indices + value = A[0] + B = T.alloc_buffer((1,), "float32") + B[0] = value + + _check(before, after) + + +def test_buffer_load_unknown_sign_index_warning(): + """ + Test that indices with unknown sign trigger warnings but are processed. 
+ This test mainly checks that the pass doesn't crash on unknown signs. + """ + + @T.prim_func + def before(A: T.Tensor((1024,), "float32")): + i = T.Var("i", "int32") + value = A[i] + B = T.alloc_buffer((1,), "float32") + B[0] = value + + @T.prim_func + def after(A: T.Tensor((1024,), "float32")): + i = T.Var("i", "int32") + # Unknown sign indices should remain unchanged + value = A[i] + B = T.alloc_buffer((1,), "float32") + B[0] = value + + _check(before, after) + + +def test_buffer_load_vector_index_negative_broadcast(): + """ + Test negative indices in vectorized operations (broadcast case). + """ + + @T.prim_func + def before(A: T.Tensor((1024,), "float32")): + vec = T.Broadcast(-1, 4) + value = A[vec] + B = T.alloc_buffer((4,), "float32") + B[T.Ramp(0, 1, 4)] = value + + @T.prim_func + def after(A: T.Tensor((1024,), "float32")): + # vec is unused and can be delimed by Simplify. + vec = T.Broadcast(-1, 4) # noqa: F841 + value = A[T.Broadcast(1023, 4)] + B = T.alloc_buffer((4,), "float32") + B[T.Ramp(0, 1, 4)] = value + + _check(before, after) + + +def test_buffer_load_vector_index_negative_ramp(): + """ + Test negative indices in vectorized operations (ramp case). + """ + + @T.prim_func + def before(A: T.Tensor((1024,), "float32")): + vec = T.Ramp(-4, 1, 4) # indices: [-4, -3, -2, -1] + value = A[vec] + B = T.alloc_buffer((4,), "float32") + B[T.Ramp(0, 1, 4)] = value + + @T.prim_func + def after(A: T.Tensor((1024,), "float32")): + # vec is unused and can be delimed by Simplify. + vec = T.Ramp(-4, 1, 4) # noqa: F841 + value = A[T.Ramp(1020, 1, 4)] + B = T.alloc_buffer((4,), "float32") + B[T.Ramp(0, 1, 4)] = value + + _check(before, after) + + +def test_buffer_load_nested_buffer_loads(): + """ + Test legalization with nested buffer load expressions. 
+ """ + + @T.prim_func + def before(A: T.Tensor((1024, 512), "float32")): + inner_val = A[-1, 10] + outer_val = A[inner_val.astype("int32"), -2] + B = T.alloc_buffer((1,), "float32") + B[0] = outer_val + + @T.prim_func + def after(A: T.Tensor((1024, 512), "float32")): + inner_val = A[1023, 10] + outer_val = A[inner_val.astype("int32"), 510] + B = T.alloc_buffer((1,), "float32") + B[0] = outer_val + + _check(before, after) + + +def test_buffer_store_negative_index(): + """ + Test negative indices in buffer store operations are legalized. + """ + + @T.prim_func + def before(A: T.Tensor((1024,), "float32")): + A[-1] = 42.0 + + @T.prim_func + def after(A: T.Tensor((1024,), "float32")): + A[1023] = 42.0 + + _check(before, after) + + +def test_buffer_store_mixed_negative_positive_indices(): + """ + Test mixed negative and positive indices in buffer store. + """ + + @T.prim_func + def before(A: T.Tensor((1024, 512), "float32")): + A[-1, 10] = 42.0 + + @T.prim_func + def after(A: T.Tensor((1024, 512), "float32")): + A[1023, 10] = 42.0 + + _check(before, after) + + +def test_buffer_store_multiple_negative_indices(): + """ + Test multiple negative indices in different dimensions for buffer store. + """ + + @T.prim_func + def before(A: T.Tensor((1024, 512, 256), "float32")): + A[-1, -2, -3] = 42.0 + + @T.prim_func + def after(A: T.Tensor((1024, 512, 256), "float32")): + A[1023, 510, 253] = 42.0 # -1+1024=1023, -2+512=510, -3+256=253 + + _check(before, after) + + +def test_buffer_store_negative_index_in_expression(): + """ + Test negative index as part of a larger expression in buffer store. 
+ """ + + @T.prim_func + def before(A: T.Tensor((1024,), "float32")): + for i in T.serial(1, 1024): + A[-i] = i * 2.0 + + @T.prim_func + def after(A: T.Tensor((1024,), "float32")): + for i in T.serial(1, 1024): + A[1024 - i] = i * 2.0 + + _check(before, after) + + +def test_buffer_store_vector_index_negative_broadcast(): + """ + Test negative indices in vectorized store operations (broadcast case). + """ + + @T.prim_func + def before(A: T.Tensor((1024,), "float32")): + vec = T.Broadcast(-1, 4) + values = T.Broadcast(42.0, 4) + A[vec] = values + + @T.prim_func + def after(A: T.Tensor((1024,), "float32")): + # vec is unused and can be delimed by Simplify. + vec = T.Broadcast(-1, 4) # noqa: F841 + values = T.Broadcast(42.0, 4) + A[T.Broadcast(1023, 4)] = values + + _check(before, after) + + +def test_buffer_store_vector_index_negative_ramp(): + """ + Test negative indices in vectorized store operations (ramp case). + """ + + @T.prim_func + def before(A: T.Tensor((1024,), "float32")): + vec = T.Ramp(-4, 1, 4) # indices: [-4, -3, -2, -1] + values = T.Ramp(0.0, 1.0, 4) # values: [0.0, 1.0, 2.0, 3.0] + A[vec] = values + + @T.prim_func + def after(A: T.Tensor((1024,), "float32")): + # vec is unused and can be delimed by Simplify. + vec = T.Ramp(-4, 1, 4) # noqa: F841 + values = T.Ramp(0.0, 1.0, 4) + A[T.Ramp(1020, 1, 4)] = values + + _check(before, after) + + +def test_buffer_store_nested_in_condition(): + """ + Test negative index buffer store within conditional statements. 
+ """ + + @T.prim_func + def before(A: T.Tensor((1024,), "float32"), flag: T.int32): + if flag > 0: + A[-1] = 42.0 + else: + A[-2] = 24.0 + + @T.prim_func + def after(A: T.Tensor((1024,), "float32"), flag: T.int32): + if flag > 0: + A[1023] = 42.0 + else: + A[1022] = 24.0 + + _check(before, after) + + +if __name__ == "__main__": + tilelang.testing.main() -- GitLab From f5d9da46788674b326ace0714c47ad36f39c1de8 Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Wed, 26 Nov 2025 15:18:50 +0800 Subject: [PATCH 050/139] [Refactor] Phaseout vmap for Tile Operators (#1334) * Refactor GEMM and Reduce operations by moving NormalizeToBufferRegion and MakeAccessPtrFromRegion to utils.{h,cc} for better code organization and reuse. * lint fix * Refactor region handling by removing the RegionOp and updating NormalizeToBufferRegion to only accept BufferLoad and BufferRegion. This change improves code organization and simplifies the handling of memory regions across various operations. * fix * Refactor memory region handling by introducing `tl.region` calls across various operations, including GEMM and fill functions. This change enhances the consistency of region management and improves code organization by utilizing utility functions for buffer region conversions. * fix * fix * test fix * lint fix * Refactor GEMM operations to improve memory region handling by replacing `mbarPtr_` with `mbarRegion_` and updating related logic in both C++ and Python implementations. This change enhances the clarity and consistency of buffer region management. 
* fix * lint fix * fix * fix * test fix * lint fix * lint fix * minor fix * fix --------- Co-authored-by: Zhiwen Mo --- .../deepseek_mla/test_example_mla_decode.py | 1 - examples/gemv/example_gemv.py | 21 +-- examples/gemv/test_example_gemv.py | 4 +- src/op/atomic_add.cc | 27 ++-- src/op/atomic_add.h | 2 +- src/op/copy.cc | 127 +++++++++--------- src/op/copy.h | 38 +++--- src/op/fill.cc | 54 +------- src/op/fill.h | 2 +- src/op/finalize_reducer.cc | 11 +- src/op/finalize_reducer.h | 2 +- src/op/gemm.cc | 28 ++-- src/op/gemm.h | 4 +- src/op/gemm_py.cc | 22 ++- src/op/gemm_py.h | 9 +- src/op/gemm_sp.cc | 16 ++- src/op/gemm_sp.h | 7 +- src/op/operator.cc | 11 +- src/op/operator.h | 13 +- src/op/reduce.cc | 15 +-- src/op/reduce.h | 4 +- src/op/region.cc | 99 +++++--------- src/op/region.h | 99 +++++--------- src/op/utils.cc | 21 +-- src/op/utils.h | 6 +- src/transform/layout_inference.cc | 21 ++- src/transform/layout_reducer.cc | 34 ++++- src/transform/lower_tile_op.cc | 3 +- .../python/issue/test_tilelang_issue_830.py | 10 ++ tilelang/intrinsics/mfma_macro_generator.py | 40 +++++- tilelang/intrinsics/mma_macro_generator.py | 41 +++++- .../intrinsics/mma_sm70_macro_generator.py | 6 +- tilelang/language/atomic.py | 25 +--- tilelang/language/copy.py | 31 +---- tilelang/language/experimental/gemm_sp.py | 18 +-- tilelang/language/fill.py | 24 +--- tilelang/language/gemm.py | 39 +++--- tilelang/language/reduce.py | 28 ++-- tilelang/language/utils.py | 85 ++---------- tilelang/tileop/gemm/gemm_base.py | 4 + tilelang/tileop/gemm/gemm_tcgen05.py | 11 +- tilelang/utils/__init__.py | 1 + tilelang/utils/language.py | 73 ++++++---- 43 files changed, 535 insertions(+), 602 deletions(-) diff --git a/examples/deepseek_mla/test_example_mla_decode.py b/examples/deepseek_mla/test_example_mla_decode.py index 66a750f7..a269ea57 100644 --- a/examples/deepseek_mla/test_example_mla_decode.py +++ b/examples/deepseek_mla/test_example_mla_decode.py @@ -1,5 +1,4 @@ import tilelang.testing - 
import example_mla_decode diff --git a/examples/gemv/example_gemv.py b/examples/gemv/example_gemv.py index 4e43dcd9..58e0114b 100644 --- a/examples/gemv/example_gemv.py +++ b/examples/gemv/example_gemv.py @@ -334,14 +334,14 @@ def get_autotuned_kernel( return main -def check_correctness_and_bench(kernel, N, K, bench_ref=True): +def check_correctness_and_bench(kernel, N, K, do_bench=True): profiler = kernel.get_profiler() profiler.assert_allclose(lambda x, y: x @ y.T, atol=1e-2, rtol=1e-2) - if bench_ref: + if do_bench: latency = profiler.do_bench(lambda x, y: x @ y.T, warmup=50) print(f"Torch Latency: {latency} ms") - latency = profiler.do_bench(kernel, warmup=50) - print(f"TileLang Latency: {latency} ms\n") + latency = profiler.do_bench(kernel, warmup=50) + print(f"TileLang Latency: {latency} ms\n") def main(do_bench: bool = True): @@ -350,12 +350,13 @@ def main(do_bench: bool = True): parser.add_argument("--k", type=int, default=1024, help="Matrix dimension K") args, _ = parser.parse_known_args() N, K = args.n, args.k - check_correctness_and_bench(naive_gemv(N, K, 128, 128), N, K) - check_correctness_and_bench(naive_splitk_gemv(N, K, 32, 32), N, K) - check_correctness_and_bench(splitk_gemv(N, K, 32, 32, 32), N, K) - check_correctness_and_bench(splitk_gemv_vectorized(N, K, 2, 32), N, K) - check_correctness_and_bench(splitk_gemv_vectorized_tvm(N, K, 2, 32), N, K) - check_correctness_and_bench(gemv_alloc_reducer(N, K, block_M=128, block_N=128), N, K) + check_correctness_and_bench(naive_gemv(N, K, 128, 128), N, K, do_bench=do_bench) + check_correctness_and_bench(naive_splitk_gemv(N, K, 32, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv(N, K, 32, 32, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv_vectorized(N, K, 2, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv_vectorized_tvm(N, K, 2, 32), N, K, do_bench=do_bench) + check_correctness_and_bench( + gemv_alloc_reducer(N, K, block_M=128, 
block_N=128), N, K, do_bench=do_bench) print("Test passed!") diff --git a/examples/gemv/test_example_gemv.py b/examples/gemv/test_example_gemv.py index 3881ca76..323337a7 100644 --- a/examples/gemv/test_example_gemv.py +++ b/examples/gemv/test_example_gemv.py @@ -1,5 +1,3 @@ -import tilelang.testing - import example_gemv @@ -8,4 +6,4 @@ def test_example_gemv(): if __name__ == "__main__": - tilelang.testing.main() + test_example_gemv() diff --git a/src/op/atomic_add.cc b/src/op/atomic_add.cc index 57e0d8b7..1a49b770 100644 --- a/src/op/atomic_add.cc +++ b/src/op/atomic_add.cc @@ -5,7 +5,7 @@ */ #include "./atomic_add.h" -#include "./region.h" +#include "utils.h" #include #include #include @@ -26,32 +26,27 @@ using namespace tir; * @brief Construct an AtomicAdd operator from call arguments and a buffer map. * * Builds the internal AtomicAddNode, extracts the source and destination - * regions and their backing Buffers from the first two call-style expressions - * in `args` (via RegionOp), and stores them along with their ranges. If a third - * argument is provided, it is interpreted as an integer immediate and stored as - * the node's coalesced width. + * regions and their backing Buffers from the first two region-style expressions + * in `args` (BufferLoad/BufferRegion), and stores them along with their + * ranges. If a third argument is provided, it is interpreted as an integer + * immediate and stored as the node's coalesced width. * * @param args Call-style PrimExprs where: * - args[0] is the source region call, * - args[1] is the destination region call, * - args[2] (optional) is an IntImm specifying coalesced width. - * @param vmap Mapping from buffers used by RegionOp to concrete Buffer objects. - * * Notes: - * - The constructor checks that args[0] and args[1] are CallNodes. + * - The constructor checks that args[0] and args[1] are region-compatible. * - The constructed node is stored in this->data_. 
*/ -AtomicAdd::AtomicAdd(Array args, BufferMap vmap) { +AtomicAdd::AtomicAdd(Array args) { ObjectPtr node = tvm::ffi::make_object(); Array rgs[2]; Buffer bf[2]; for (int i = 0; i < 2; i++) { - auto expr = args[i]; - auto call = expr.as(); - ICHECK(call); - auto region = RegionOp(call->args, vmap); - rgs[i] = region->GetRanges(); - bf[i] = region->GetBuffer(); + auto region = NormalizeToBufferRegion(args[i]); + rgs[i] = region->region; + bf[i] = region->buffer; } std::tie(node->src, node->dst) = std::tie(bf[0], bf[1]); std::tie(node->src_range, node->dst_range) = std::tie(rgs[0], rgs[1]); @@ -552,4 +547,4 @@ TIR_REGISTER_TL_OP(AtomicAdd, atomicadd) TVM_FFI_STATIC_INIT_BLOCK() { AtomicAddNode::RegisterReflection(); } } // namespace tl -} // namespace tvm \ No newline at end of file +} // namespace tvm diff --git a/src/op/atomic_add.h b/src/op/atomic_add.h index f3aaacdb..c6beb70e 100644 --- a/src/op/atomic_add.h +++ b/src/op/atomic_add.h @@ -65,7 +65,7 @@ class AtomicAdd : public TileOperator { public: TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(AtomicAdd, TileOperator, AtomicAddNode); - TVM_DLL AtomicAdd(Array args, BufferMap vmap); + TVM_DLL AtomicAdd(Array args); static const Op &Get(); }; diff --git a/src/op/copy.cc b/src/op/copy.cc index 82c903f8..9b93fea1 100644 --- a/src/op/copy.cc +++ b/src/op/copy.cc @@ -16,7 +16,7 @@ #include "../transform/common/loop_parallel_transform_utils.h" #include "../transform/loop_partition.h" #include "../transform/loop_vectorize.h" -#include "region.h" +#include "utils.h" #include "../target/cuda.h" #include "../target/utils.h" @@ -110,36 +110,32 @@ template static Array ReverseArray(Array array) { /*! * \brief Construct a Copy operator node from call arguments and a buffer map. * - * This constructor parses the first two entries of `args` as Call nodes - * describing source and destination Regions (via RegionOp), extracts their - * Buffers and Ranges, and stores them on the newly created CopyNode. 
It also + * This constructor parses the first two entries of `args` as regions + * (BufferLoad/BufferRegion), extracts their Buffers and Ranges, and stores + * them on the newly created CopyNode. It also * reads optional arguments: * - args[2] (IntImm): coalesced width (stored only if > 0), * - args[3] (Bool): disable TMA lowering flag, * - args[4] (IntImm): eviction policy. * * Preconditions: - * - `args` must contain at least two Call-compatible PrimExpr entries - * describing regions; an ICHECK will fail if they are not CallNodes. + * - `args` must contain at least two region-compatible PrimExpr entries + * (BufferLoad/BufferRegion); ICHECK will fail otherwise. * * @param args Array of PrimExpr where: * - args[0] is the source Region call, * - args[1] is the destination Region call, * - optional args[2..4] are coalesced width, disable_tma, and eviction * policy. - * @param vmap BufferMap used to resolve RegionOp buffers and ranges. */ -Copy::Copy(Array args, BufferMap vmap) { +Copy::Copy(Array args) { ObjectPtr node = tvm::ffi::make_object(); Array rgs[2]; Buffer bf[2]; for (int i = 0; i < 2; i++) { - auto expr = args[i]; - auto call = expr.as(); - ICHECK(call); - auto region = RegionOp(call->args, vmap); - rgs[i] = region->GetRanges(); - bf[i] = region->GetBuffer(); + auto region = NormalizeToBufferRegion(args[i]); + rgs[i] = region->region; + bf[i] = region->buffer; } std::tie(node->src, node->dst) = std::tie(bf[0], bf[1]); std::tie(node->src_range, node->dst_range) = std::tie(rgs[0], rgs[1]); @@ -250,6 +246,7 @@ PrimExpr CopyNode::MakePredicate(arith::Analyzer *analyzer, const Array &ivs, Array extents, int src_dst) const { Array ranges = src_dst == 0 ? 
src_range : dst_range; + Array cond_list; ICHECK(extents.size() == ranges.size()) << extents << " " << ranges; size_t idx = 0; @@ -302,7 +299,6 @@ For CopyNode::MakeSIMTLoop(arith::Analyzer *analyzer) const { for (const auto &iv : loop_vars) analyzer->Bind(iv->var, iv->dom); - ICHECK(loop_vars.size() <= src_range.size()) << "loop_vars.size() = " << loop_vars.size() << ", src_range.size() = " << src_range.size() << ", src = " << src->name @@ -1729,20 +1725,21 @@ Array TMADesc::EncodeCallArgs() const { * GPU intrinsics. * * @param args Array of PrimExpr TL-call arguments (see list above). - * @param vmap Mapping from original buffer variables to actual Buffer objects. */ -Conv2DIm2ColOp::Conv2DIm2ColOp(Array args, BufferMap vmap) { +Conv2DIm2ColOp::Conv2DIm2ColOp(Array args) { ObjectPtr node = tvm::ffi::make_object(); - node->src = vmap[GetVarFromAccessPtr(args[0])]; - node->dst = vmap[GetVarFromAccessPtr(args[1])]; - node->nhw_step = args[2]; - node->c_step = args[3]; - node->kernel = args[4].as().value()->value; - node->stride = args[5].as().value()->value; - node->dilation = args[6].as().value()->value; - node->padding = args[7].as().value()->value; - node->eviction_policy = args[8].as().value()->value; + node->srcRegion_ = NormalizeToBufferRegion(args[0]); + node->dstRegion_ = NormalizeToBufferRegion(args[1]); + node->src_ = node->srcRegion_->buffer; + node->dst_ = node->dstRegion_->buffer; + node->nhw_step_ = args[2]; + node->c_step_ = args[3]; + node->kernel_ = args[4].as().value()->value; + node->stride_ = args[5].as().value()->value; + node->dilation_ = args[6].as().value()->value; + node->padding_ = args[7].as().value()->value; + node->eviction_policy_ = args[8].as().value()->value; data_ = std::move(node); } @@ -1793,24 +1790,24 @@ TileOperator Conv2DIm2ColOpNode::Clone() const { Stmt Conv2DIm2ColOpNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { ICHECK(TargetIsHopper(T.target)); - ICHECK(src.scope() == "global" && - (dst.scope() == 
"shared.dyn" || dst.scope() == "shared")); - ICHECK(src->shape.size() == 4); - ICHECK(dst->shape.size() == 2); - ICHECK(src->dtype == dst->dtype); + ICHECK(src_.scope() == "global" && + (dst_.scope() == "shared.dyn" || dst_.scope() == "shared")); + ICHECK(src_->shape.size() == 4); + ICHECK(dst_->shape.size() == 2); + ICHECK(src_->dtype == dst_->dtype); Layout shared_layout; - if (T.layout_map.count(dst)) { - shared_layout = T.layout_map[dst]; + if (T.layout_map.count(dst_)) { + shared_layout = T.layout_map[dst_]; } TMAIm2ColDesc desc; - desc.rank = src->shape.size(); - desc.data_type = to_CUtensorMapDataType(src->dtype); - desc.global_addr = src->data; - desc.global_shape = ReverseArray(src->shape); + desc.rank = src_->shape.size(); + desc.data_type = to_CUtensorMapDataType(src_->dtype); + desc.global_addr = src_->data; + desc.global_shape = ReverseArray(src_->shape); - if (!src->strides.empty()) { - desc.global_stride = ReverseArray(src->strides); + if (!src_->strides.empty()) { + desc.global_stride = ReverseArray(src_->strides); } else { // Create stride from shape PrimExpr stride = 1; @@ -1824,13 +1821,13 @@ Stmt Conv2DIm2ColOpNode::Lower(const LowerArgs &T, ICHECK(is_one(desc.global_stride[0])) << desc.global_stride; // Make global stride in bytes desc.global_stride = desc.global_stride.Map([&](PrimExpr e) { - return cast(DataType::Int(64), e) * src->dtype.bytes(); + return cast(DataType::Int(64), e) * src_->dtype.bytes(); }); - desc.elem_stride = {1, stride, stride, 1}; - desc.lower_corner = {-padding, -padding}; - desc.upper_corner = {-padding, -padding}; - desc.smem_box_pixel = Downcast(dst->shape[0])->value; - desc.smem_box_channel = Downcast(dst->shape[1])->value; + desc.elem_stride = {1, stride_, stride_, 1}; + desc.lower_corner = {-padding_, -padding_}; + desc.upper_corner = {-padding_, -padding_}; + desc.smem_box_pixel = Downcast(dst_->shape[0])->value; + desc.smem_box_channel = Downcast(dst_->shape[1])->value; desc.l2_promotion = 
static_cast(CU_TENSOR_MAP_L2_PROMOTION_L2_128B); desc.oob_fill = static_cast(CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); desc.interleave = static_cast(CU_TENSOR_MAP_INTERLEAVE_NONE); @@ -1844,15 +1841,15 @@ Stmt Conv2DIm2ColOpNode::Lower(const LowerArgs &T, if (StructuralEqual()(shared_layout, makeQuarterBankSwizzleLayout(*stride, *continuous, - dst->dtype.bits()))) { + dst_->dtype.bits()))) { desc.swizzle = static_cast(CU_TENSOR_MAP_SWIZZLE_32B); } else if (StructuralEqual()(shared_layout, makeHalfBankSwizzleLayout( *stride, *continuous, - dst->dtype.bits()))) { + dst_->dtype.bits()))) { desc.swizzle = static_cast(CU_TENSOR_MAP_SWIZZLE_64B); } else if (StructuralEqual()(shared_layout, makeFullBankSwizzleLayout( *stride, *continuous, - dst->dtype.bits()))) { + dst_->dtype.bits()))) { desc.swizzle = static_cast(CU_TENSOR_MAP_SWIZZLE_128B); } else { ICHECK(0) << "Cannot detect TMA layout."; @@ -1871,43 +1868,43 @@ Stmt Conv2DIm2ColOpNode::Lower(const LowerArgs &T, << "Currently can only support divisible channel case"; global_coords.push_back( - FloorMod(c_step * desc.smem_box_channel, desc.global_shape[0])); + FloorMod(c_step_ * desc.smem_box_channel, desc.global_shape[0])); image_offset.push_back( - dilation * - FloorMod(FloorDiv(c_step * desc.smem_box_channel, desc.global_shape[0]), - kernel)); - image_offset.push_back(dilation * FloorDiv(c_step * desc.smem_box_channel, - desc.global_shape[0] * kernel)); + dilation_ * + FloorMod(FloorDiv(c_step_ * desc.smem_box_channel, desc.global_shape[0]), + kernel_)); + image_offset.push_back(dilation_ * FloorDiv(c_step_ * desc.smem_box_channel, + desc.global_shape[0] * kernel_)); PrimExpr h_dim = - FloorDiv(src->shape[1] + 2 * padding - (kernel - 1) * dilation - 1, - stride) + + FloorDiv(src_->shape[1] + 2 * padding_ - (kernel_ - 1) * dilation_ - 1, + stride_) + 1; PrimExpr w_dim = - FloorDiv(src->shape[2] + 2 * padding - (kernel - 1) * dilation - 1, - stride) + + FloorDiv(src_->shape[2] + 2 * padding_ - (kernel_ - 1) * dilation_ - 
1, + stride_) + 1; global_coords.push_back( - stride * FloorMod(nhw_step * desc.smem_box_pixel, w_dim) - padding); + stride_ * FloorMod(nhw_step_ * desc.smem_box_pixel, w_dim) - padding_); global_coords.push_back( - stride * - FloorMod(FloorDiv(nhw_step * desc.smem_box_pixel, w_dim), h_dim) - - padding); + stride_ * + FloorMod(FloorDiv(nhw_step_ * desc.smem_box_pixel, w_dim), h_dim) - + padding_); global_coords.push_back( - FloorDiv(nhw_step * desc.smem_box_pixel, w_dim * h_dim)); + FloorDiv(nhw_step_ * desc.smem_box_pixel, w_dim * h_dim)); Array args; args.reserve(desc.rank * 2 + 2); args.push_back(create_desc); args.push_back(0); // mbar placeholder - auto dst_buffer = T.buffer_remap.count(dst) ? T.buffer_remap[dst] : dst; + auto dst_buffer = T.buffer_remap.count(dst_) ? T.buffer_remap[dst_] : dst_; auto shared_addr = dst_buffer.access_ptr(2); args.push_back(shared_addr); for (auto coord : global_coords) args.push_back(coord); for (auto offset : image_offset) args.push_back(offset); - args.push_back(this->eviction_policy); + args.push_back(this->eviction_policy_); Stmt tma_copy = IfThenElse(EQ(T.thread_var, T.thread_bounds->min), Evaluate(Call(DataType::Handle(), tma_load_im2col(), args))); diff --git a/src/op/copy.h b/src/op/copy.h index ef46b9ed..b08f5768 100644 --- a/src/op/copy.h +++ b/src/op/copy.h @@ -280,7 +280,7 @@ public: * \param args Expression arguments for the copy. * \param vmap Buffer variable mapping. */ - TVM_DLL Copy(Array args, BufferMap vmap); + TVM_DLL Copy(Array args); /*! * \brief Get the TVM Op handle corresponding to this Copy op. 
@@ -296,14 +296,16 @@ public: */ class Conv2DIm2ColOpNode : public TileOperatorNode { public: - Buffer src, dst; // Source (input feature map) and destination (im2col matrix) - int stride; // Stride for convolution - int padding; // Padding amount - int dilation; // Dilation factor - int kernel; // Kernel size - int eviction_policy; // Cache eviction policy - PrimExpr nhw_step; // Step size in NHW dimensions - PrimExpr c_step; // Step size in channel dimension + BufferRegion srcRegion_, dstRegion_; + Buffer src_, + dst_; // Source (input feature map) and destination (im2col matrix) + int stride_; // Stride for convolution + int padding_; // Padding amount + int dilation_; // Dilation factor + int kernel_; // Kernel size + int eviction_policy_; // Cache eviction policy + PrimExpr nhw_step_; // Step size in NHW dimensions + PrimExpr c_step_; // Step size in channel dimension TVM_FFI_DECLARE_OBJECT_INFO_FINAL("tl.Conv2DIm2Col", Conv2DIm2ColOpNode, TileOperatorNode); @@ -311,13 +313,15 @@ public: static void RegisterReflection() { namespace refl = tvm::ffi::reflection; refl::ObjectDef() - .def_ro("src", &Conv2DIm2ColOpNode::src) - .def_ro("dst", &Conv2DIm2ColOpNode::dst) - .def_ro("stride", &Conv2DIm2ColOpNode::stride) - .def_ro("padding", &Conv2DIm2ColOpNode::padding) - .def_ro("dilation", &Conv2DIm2ColOpNode::dilation) - .def_ro("kernel", &Conv2DIm2ColOpNode::kernel) - .def_ro("eviction_policy", &Conv2DIm2ColOpNode::eviction_policy); + .def_ro("srcRegion", &Conv2DIm2ColOpNode::srcRegion_) + .def_ro("dstRegion", &Conv2DIm2ColOpNode::dstRegion_) + .def_ro("src", &Conv2DIm2ColOpNode::src_) + .def_ro("dst", &Conv2DIm2ColOpNode::dst_) + .def_ro("stride", &Conv2DIm2ColOpNode::stride_) + .def_ro("padding", &Conv2DIm2ColOpNode::padding_) + .def_ro("dilation", &Conv2DIm2ColOpNode::dilation_) + .def_ro("kernel", &Conv2DIm2ColOpNode::kernel_) + .def_ro("eviction_policy", &Conv2DIm2ColOpNode::eviction_policy_); } /*! 
@@ -342,7 +346,7 @@ class Conv2DIm2ColOp : public TileOperator { public: TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Conv2DIm2ColOp, TileOperator, Conv2DIm2ColOpNode); - TVM_DLL Conv2DIm2ColOp(Array args, BufferMap vmap); + TVM_DLL Conv2DIm2ColOp(Array args); static const Op &Get(); }; diff --git a/src/op/fill.cc b/src/op/fill.cc index 93b3bca0..5a773768 100644 --- a/src/op/fill.cc +++ b/src/op/fill.cc @@ -17,7 +17,7 @@ #include "../transform/loop_partition.h" #include "../transform/loop_vectorize.h" #include "builtin.h" -#include "region.h" +#include "utils.h" namespace tvm { namespace tl { @@ -52,62 +52,18 @@ using namespace tir; * value]. * - args[0]: destination access (BufferLoad or pointer expression). * - args[1]: value to fill (scalar or vector). - * @param vmap Mapping from buffer variables to Buffer objects; used to resolve - * the destination when args[0] is not a BufferLoad. * * Notes: * - The constructor enforces constraints (e.g., stride == 1 ramps, constant * lanes) and will terminate (via CHECK/ICHECK) if inputs are unsupported or out * of bounds. 
*/ -Fill::Fill(Array args, BufferMap vmap) { +Fill::Fill(Array args) { ObjectPtr node = tvm::ffi::make_object(); - // Case 1: Region descriptor call (tl.region) - if (const auto *call = args[0].as()) { - if (call->op.same_as(RegionOp::Get())) { - auto region = RegionOp(call->args, vmap); - node->dst = region->GetBuffer(); - node->region = region->GetRanges(); - } else if (call->op.same_as(builtin::tvm_access_ptr())) { - node->dst = vmap[GetVarFromAccessPtr(args[0])]; - for (int i = 0; i < node->dst->shape.size(); i++) { - node->region.push_back(Range(0, node->dst->shape[i])); - } - } else { - ICHECK(false) << "Unsupported call op in tl.fill: " - << Downcast(call->op)->name; - } - - // Case 2: Explicit BufferRegion (legacy path) - } else if (args[0]->IsInstance()) { - auto region = Downcast(args[0]); - node->dst = region->buffer; - node->region = region->region; - - // Case 3: Vector/scalar region expressed via BufferLoad indices - } else if (args[0]->IsInstance()) { - auto buffer_load = Downcast(args[0]); - for (const auto &index : buffer_load->indices) { - if (const auto *ramp = index.as()) { - CHECK(ramp->stride.as()->value == 1) - << "Only stride 1 ramps are supported"; - const auto *lanes = ramp->lanes.as(); - CHECK(lanes) - << "Scalable vectors not supported in BufferRegion conversion"; - node->region.push_back(Range::FromMinExtent(ramp->base, ramp->lanes)); - } else { - node->region.push_back(Range::FromMinExtent(index, 1)); - } - } - node->dst = buffer_load->buffer; - // Case 4: Access pointer, fill the full buffer - } else { - node->dst = vmap[GetVarFromAccessPtr(args[0])]; - for (int i = 0; i < node->dst->shape.size(); i++) { - node->region.push_back(Range(0, node->dst->shape[i])); - } - } + BufferRegion region = NormalizeToBufferRegion(args[0]); + node->dst = region->buffer; + node->region = region->region; if (args[1]->dtype != node->dst->dtype) { node->value = Cast(node->dst->dtype, args[1]); diff --git a/src/op/fill.h b/src/op/fill.h index 
8f1dd900..c10a5cfb 100644 --- a/src/op/fill.h +++ b/src/op/fill.h @@ -45,7 +45,7 @@ private: class Fill : public TileOperator { public: TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Fill, TileOperator, FillNode); - TVM_DLL Fill(Array args, BufferMap vmap); + TVM_DLL Fill(Array args); static const Op &Get(); }; diff --git a/src/op/finalize_reducer.cc b/src/op/finalize_reducer.cc index 84b18897..effc4baf 100644 --- a/src/op/finalize_reducer.cc +++ b/src/op/finalize_reducer.cc @@ -12,6 +12,7 @@ #include #include "../target/utils.h" +#include "utils.h" namespace tvm { namespace tl { @@ -29,12 +30,14 @@ using namespace tir; * @param args TL operator arguments: expects at least two elements where * `args[0]` is an access pointer identifying the reducer variable * and `args[1]` is an integer encoding a `ReducerOpType` (e.g., Sum/Max/Min). - * @param vmap Mapping from variables to Buffers used to look up the reducer - * Buffer. */ -FinalizeReducerOp::FinalizeReducerOp(Array args, BufferMap vmap) { +FinalizeReducerOp::FinalizeReducerOp(Array args) { auto node = tvm::ffi::make_object(); - node->reducer = vmap[GetVarFromAccessPtr(args[0])]; + // Normalize any supported region expression + // (BufferRegion/BufferLoad/tl.region) to a BufferRegion, then take the + // underlying Buffer as reducer. 
+ auto region = NormalizeToBufferRegion(args[0]); + node->reducer = region->buffer; node->op = (ReducerOpType)*as_const_int(args[1]); data_ = std::move(node); } diff --git a/src/op/finalize_reducer.h b/src/op/finalize_reducer.h index ef49ee19..99e1e7cb 100644 --- a/src/op/finalize_reducer.h +++ b/src/op/finalize_reducer.h @@ -48,7 +48,7 @@ class FinalizeReducerOp : public TileOperator { public: TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(FinalizeReducerOp, TileOperator, FinalizeReducerOpNode); - TVM_DLL FinalizeReducerOp(Array args, BufferMap vmap); + TVM_DLL FinalizeReducerOp(Array args); static const Op &Get(); }; diff --git a/src/op/gemm.cc b/src/op/gemm.cc index cece1e6f..5a98cba6 100644 --- a/src/op/gemm.cc +++ b/src/op/gemm.cc @@ -12,7 +12,6 @@ #include #include "../target/utils.h" -#include "region.h" #include "tcgen5_meta.h" #include "utils.h" @@ -42,8 +41,6 @@ using namespace tir; * M (Int), N (Int), K (Int), policy (Int), clear_accum (Bool), * stride_A (Int), stride_B (Int), offset_A (Int), offset_B (Int), * (optional) kPack (Int), (optional) wg_wait (Int)] - * @param vmap Mapping from access pointer vars to Buffer objects used to - * resolve the Buffer corresponding to each pointer argument. * * @note If `kPack` is provided it must be 1; otherwise the constructor * fails with an ICHECK (runtime assertion). 
No other validation is @@ -53,12 +50,12 @@ using namespace tir; // MakeAccessPtrFromRegion moved to src/op/utils.{h,cc} -Gemm::Gemm(Array args, BufferMap vmap) { +Gemm::Gemm(Array args) { ObjectPtr node = tvm::ffi::make_object(); - node->aRegion_ = NormalizeToBufferRegion(args[0], vmap); - node->bRegion_ = NormalizeToBufferRegion(args[1], vmap); - node->cRegion_ = NormalizeToBufferRegion(args[2], vmap); + node->aRegion_ = NormalizeToBufferRegion(args[0]); + node->bRegion_ = NormalizeToBufferRegion(args[1]); + node->cRegion_ = NormalizeToBufferRegion(args[2]); node->a_ = node->aRegion_->buffer; node->b_ = node->bRegion_->buffer; @@ -83,11 +80,14 @@ Gemm::Gemm(Array args, BufferMap vmap) { if (args.size() > 15) { node->wgWait_ = args[15].as().value()->value; } - node->mbarPtr_ = args[16]; - if (node->mbarPtr_.as()) { - node->mbar_ = vmap[GetVarFromAccessPtr(node->mbarPtr_)]; - } else { - node->mbar_ = std::nullopt; + if (args.size() > 16) { + if (const auto *load = args[16].as()) { + node->mbarRegion_ = + NormalizeToBufferRegion(Downcast(args[16])); + node->mbar_ = node->mbarRegion_->buffer; + } else { + node->mbar_ = std::nullopt; + } } node->cCoords_ = Array( {args[17].as().value(), args[18].as().value()}); @@ -500,11 +500,13 @@ Stmt GemmNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { auto C_buffer = T.buffer_remap.count(c_) ? 
T.buffer_remap[c_] : c_; Array new_args; + auto mbarPtr = + MakeAccessPtrFromRegion(mbarRegion_, /*rw*/ 3, /*require_2d*/ true); new_args.push_back(StringImm(ss.str())); new_args.push_back(Aptr); new_args.push_back(Bptr); new_args.push_back(BufferLoad(C_buffer, cCoords_)); - new_args.push_back(mbarPtr_); + new_args.push_back(mbarPtr); new_args.push_back(clearAccum_); auto new_call = Call(DataType::Handle(), builtin::call_extern(), new_args); diff --git a/src/op/gemm.h b/src/op/gemm.h index 1c976055..3ec58bec 100644 --- a/src/op/gemm.h +++ b/src/op/gemm.h @@ -97,7 +97,7 @@ public: // only will be enabled under cdna mfma instructions int kPack_ = 1; int wgWait_ = 0; - PrimExpr mbarPtr_; + BufferRegion mbarRegion_; std::optional mbar_; // mbar is optional, only used for TCGEN5MMA Array cCoords_; mutable GemmWarpPolicy policy_; @@ -144,7 +144,7 @@ private: class Gemm : public TileOperator { public: TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Gemm, TileOperator, GemmNode); - TVM_DLL Gemm(Array args, BufferMap vmap); + TVM_DLL Gemm(Array args); static const Op &Get(); }; diff --git a/src/op/gemm_py.cc b/src/op/gemm_py.cc index a6ddef64..511a4283 100644 --- a/src/op/gemm_py.cc +++ b/src/op/gemm_py.cc @@ -12,7 +12,6 @@ #include #include "../target/utils.h" -#include "region.h" #include "tcgen5_meta.h" #include "utils.h" @@ -46,19 +45,17 @@ using namespace tir; * M (Int), N (Int), K (Int), policy (Int), clear_accum (Bool), * stride_A (Int), stride_B (Int), offset_A (Int), offset_B (Int), * (optional) kPack (Int), (optional) wg_wait (Int)] - * @param vmap Mapping from access pointer vars to Buffer objects used to - * resolve the Buffer corresponding to each pointer argument. * * @note If `kPack` is provided it must be 1 or 2; otherwise the constructor * fails with an ICHECK (runtime assertion). No other validation is * performed here. 
*/ -GemmPy::GemmPy(Array args, BufferMap vmap) { +GemmPy::GemmPy(Array args) { ObjectPtr node = tvm::ffi::make_object(); - node->aRegion_ = NormalizeToBufferRegion(args[0], vmap); - node->bRegion_ = NormalizeToBufferRegion(args[1], vmap); - node->cRegion_ = NormalizeToBufferRegion(args[2], vmap); + node->aRegion_ = NormalizeToBufferRegion(args[0]); + node->bRegion_ = NormalizeToBufferRegion(args[1]); + node->cRegion_ = NormalizeToBufferRegion(args[2]); node->a_ = node->aRegion_->buffer; node->b_ = node->bRegion_->buffer; @@ -83,11 +80,12 @@ GemmPy::GemmPy(Array args, BufferMap vmap) { if (args.size() > 15) { node->wgWait_ = args[15].as().value()->value; } - node->mbarPtr_ = args[16]; - if (node->mbarPtr_.as()) { - node->mbar_ = vmap[GetVarFromAccessPtr(node->mbarPtr_)]; - } else { - node->mbar_ = std::nullopt; + if (args.size() > 16) { + if (const auto *load = args[16].as()) { + node->mbarRegion_ = + NormalizeToBufferRegion(Downcast(args[16])); + node->mbar_ = node->mbarRegion_->buffer; + } } node->cCoords_ = Array( {args[17].as().value(), args[18].as().value()}); diff --git a/src/op/gemm_py.h b/src/op/gemm_py.h index 0678588e..2fe47be8 100644 --- a/src/op/gemm_py.h +++ b/src/op/gemm_py.h @@ -29,8 +29,8 @@ public: int strideA_, strideB_; int offsetA_, offsetB_; PrimExpr clearAccum_ = const_false(); - PrimExpr mbarPtr_; - std::optional mbar_; // mbar is optional, only used for TCGEN5MMA + BufferRegion mbarRegion_; + tir::Buffer mbar_; // mbar is optional, only used for TCGEN5MMA Array cCoords_; // k_pack please ref to bitblas/tl/mfma_macro_generator.py::k_pack // only will be enabled under cdna mfma instructions @@ -59,7 +59,8 @@ public: .def_ro("offsetA", &GemmPyNode::offsetA_) .def_ro("offsetB", &GemmPyNode::offsetB_) .def_ro("clearAccum", &GemmPyNode::clearAccum_) - .def_ro("mbarPtr", &GemmPyNode::mbarPtr_) + .def_ro("mbarRegion", &GemmPyNode::mbarRegion_) + .def_ro("mbar", &GemmPyNode::mbar_) .def_ro("cCoords", &GemmPyNode::cCoords_) .def_ro("kPack", 
&GemmPyNode::kPack_) .def_ro("wgWait", &GemmPyNode::wgWait_) @@ -82,7 +83,7 @@ private: class GemmPy : public TileOperator { public: TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(GemmPy, TileOperator, GemmPyNode); - TVM_DLL GemmPy(Array args, BufferMap vmap); + TVM_DLL GemmPy(Array args); static const Op &Get(); }; diff --git a/src/op/gemm_sp.cc b/src/op/gemm_sp.cc index 52a119e0..df923d0e 100644 --- a/src/op/gemm_sp.cc +++ b/src/op/gemm_sp.cc @@ -14,6 +14,7 @@ #include "../target/utils.h" #include "builtin.h" #include "gemm.h" +#include "utils.h" namespace tvm { namespace tl { @@ -79,16 +80,19 @@ std::pair GemmSPWarpPolicyNode::computeWarpPartition(int M, int N, * The populated GemmSPNode is stored in the instance's internal data_ pointer. * * @param args Positional TL call arguments in the above order. - * @param vmap BufferMap mapping access pointers (from args) to Buffer objects. * * @note An ICHECK failure is raised if a provided kPack is not 1 or 2. */ -GemmSP::GemmSP(Array args, BufferMap vmap) { +GemmSP::GemmSP(Array args) { ObjectPtr node = tvm::ffi::make_object(); - node->a_ = vmap[GetVarFromAccessPtr(args[0])]; - node->e_ = vmap[GetVarFromAccessPtr(args[1])]; - node->b_ = vmap[GetVarFromAccessPtr(args[2])]; - node->c_ = vmap[GetVarFromAccessPtr(args[3])]; + node->aRegion_ = NormalizeToBufferRegion(args[0]); + node->eRegion_ = NormalizeToBufferRegion(args[1]); + node->bRegion_ = NormalizeToBufferRegion(args[2]); + node->cRegion_ = NormalizeToBufferRegion(args[3]); + node->a_ = node->aRegion_->buffer; + node->e_ = node->eRegion_->buffer; + node->b_ = node->bRegion_->buffer; + node->c_ = node->cRegion_->buffer; node->transA_ = args[4].as().value(); node->transB_ = args[5].as().value(); node->m_ = args[6].as().value()->value; diff --git a/src/op/gemm_sp.h b/src/op/gemm_sp.h index 1eb535a5..aae5b27b 100644 --- a/src/op/gemm_sp.h +++ b/src/op/gemm_sp.h @@ -53,6 +53,7 @@ public: class GemmSPNode : public TileOperatorNode { public: + BufferRegion aRegion_, 
bRegion_, cRegion_, eRegion_; tir::Buffer a_, b_, c_, e_; bool transA_, transB_; int m_, n_, k_; @@ -75,6 +76,10 @@ public: namespace refl = tvm::ffi::reflection; refl::ObjectDef() .def_ro("policy", &GemmSPNode::policy_) + .def_ro("aRegion", &GemmSPNode::aRegion_) + .def_ro("bRegion", &GemmSPNode::bRegion_) + .def_ro("cRegion", &GemmSPNode::cRegion_) + .def_ro("eRegion", &GemmSPNode::eRegion_) .def_ro("a", &GemmSPNode::a_) .def_ro("b", &GemmSPNode::b_) .def_ro("c", &GemmSPNode::c_) @@ -96,7 +101,7 @@ private: class GemmSP : public TileOperator { public: TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(GemmSP, TileOperator, GemmSPNode); - TVM_DLL GemmSP(Array args, BufferMap vmap); + TVM_DLL GemmSP(Array args); static const Op &Get(); }; diff --git a/src/op/operator.cc b/src/op/operator.cc index b751559c..302ee3e3 100644 --- a/src/op/operator.cc +++ b/src/op/operator.cc @@ -24,16 +24,14 @@ using namespace tir; * * @param call The TIR Call whose operator and arguments will be used to build * the TileOperator. - * @param vmap Buffer mapping passed through to the builder to resolve buffer - * references. * @return TileOperator The constructed TileOperator, or a default (empty) * TileOperator if no builder exists. */ -TileOperator ParseOperator(Call call, BufferMap vmap) { +TileOperator ParseOperator(Call call) { auto op_map = Op::GetAttrMap("TLOpBuilder"); Op op = call->op.as().value(); if (op_map.count(op)) { - auto tile_op = op_map[op](call->args, vmap); + auto tile_op = op_map[op](call->args); ICHECK(tile_op.defined()); return tile_op; } @@ -48,14 +46,13 @@ TileOperator ParseOperator(Call call, BufferMap vmap) { * Otherwise returns a default-constructed (empty) TileOperator. * * @param stmt TIR statement to inspect; expected to be an Evaluate of a Call. - * @param vmap Mapping of buffer variables used when building the operator. * @return TileOperator Parsed operator on success, or a default (empty) * TileOperator if `stmt` is not an Evaluate(Call). 
*/ -TileOperator ParseOperator(Stmt stmt, BufferMap vmap) { +TileOperator ParseOperator(Stmt stmt) { if (stmt.as() && stmt.as()->value.as()) { auto call = stmt.as()->value.as(); - return ParseOperator(tvm::ffi::GetRef(call), vmap); + return ParseOperator(tvm::ffi::GetRef(call)); } return TileOperator(); } diff --git a/src/op/operator.h b/src/op/operator.h index 628b83b2..0d9f859a 100644 --- a/src/op/operator.h +++ b/src/op/operator.h @@ -72,11 +72,10 @@ public: Var GetVarFromAccessPtr(const PrimExpr &expr); -TileOperator ParseOperator(Call call, BufferMap vmap); -TileOperator ParseOperator(Stmt stmt, BufferMap vmap); +TileOperator ParseOperator(Call call); +TileOperator ParseOperator(Stmt stmt); -using OpBuilderFunc = - ffi::TypedFunction, BufferMap)>; +using OpBuilderFunc = ffi::TypedFunction)>; #define TIR_REGISTER_TL_OP(Entry, OpName) \ const Op &Entry::Get() { \ @@ -85,10 +84,8 @@ using OpBuilderFunc = } \ TVM_REGISTER_OP("tl." #OpName) \ .set_attr("TScriptPrinterName", #OpName) \ - .set_attr("TLOpBuilder", \ - [](Array args, BufferMap vmap) { \ - return Entry(args, vmap); \ - }) + .set_attr( \ + "TLOpBuilder", [](Array args) { return Entry(args); }) } // namespace tl } // namespace tvm diff --git a/src/op/reduce.cc b/src/op/reduce.cc index c326f5ac..caf9198a 100644 --- a/src/op/reduce.cc +++ b/src/op/reduce.cc @@ -14,7 +14,6 @@ #include "../op/parallel.h" #include "../target/utils.h" #include "../transform/loop_partition.h" -#include "region.h" #include "tir/transforms/ir_utils.h" #include "tvm/tir/stmt.h" #include "utils.h" @@ -28,11 +27,11 @@ using namespace tir; // MakeAccessPtrFromRegion moved to src/op/utils.{h,cc} -ReduceOp::ReduceOp(Array args, BufferMap vmap) { +ReduceOp::ReduceOp(Array args) { ObjectPtr node = tvm::ffi::make_object(); - // Accept BufferRegion/BufferLoad/tl.region for src/dst - node->srcRegion_ = NormalizeToBufferRegion(args[0], vmap); - node->dstRegion_ = NormalizeToBufferRegion(args[1], vmap); + // Accept BufferRegion/BufferLoad for 
src/dst + node->srcRegion_ = NormalizeToBufferRegion(args[0]); + node->dstRegion_ = NormalizeToBufferRegion(args[1]); node->src = node->srcRegion_->buffer; node->dst = node->dstRegion_->buffer; std::string reduce_type = args[2].as().value()->value; @@ -494,7 +493,7 @@ static BufferRegion ConvertBufferToBufferRegion(const Buffer &buf) { return BufferRegion(buf, ranges); } -CumSumOp::CumSumOp(Array args, BufferMap vmap) { +CumSumOp::CumSumOp(Array args) { /// CumSum constructor arguments: /// - src: input buffer /// - dst: output buffer @@ -504,8 +503,8 @@ CumSumOp::CumSumOp(Array args, BufferMap vmap) { ObjectPtr node = tvm::ffi::make_object(); // node->src = vmap[GetVarFromAccessPtr(args[0])]; // node->dst = vmap[GetVarFromAccessPtr(args[1])]; - node->srcRegion_ = NormalizeToBufferRegion(args[0], vmap); - node->dstRegion_ = NormalizeToBufferRegion(args[1], vmap); + node->srcRegion_ = NormalizeToBufferRegion(args[0]); + node->dstRegion_ = NormalizeToBufferRegion(args[1]); node->src = node->srcRegion_->buffer; node->dst = node->dstRegion_->buffer; node->dim = args[2].as().value()->value; diff --git a/src/op/reduce.h b/src/op/reduce.h index eb0599eb..cab3835e 100644 --- a/src/op/reduce.h +++ b/src/op/reduce.h @@ -125,7 +125,7 @@ class ReduceOp : public TileOperator { public: TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(ReduceOp, TileOperator, ReduceOpNode); - TVM_DLL ReduceOp(Array args, BufferMap vmap); + TVM_DLL ReduceOp(Array args); static const Op &Get(); }; @@ -163,7 +163,7 @@ class CumSumOp : public TileOperator { public: TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(CumSumOp, TileOperator, CumSumOpNode); - TVM_DLL CumSumOp(Array args, BufferMap vmap); + TVM_DLL CumSumOp(Array args); static const Op &Get(); }; diff --git a/src/op/region.cc b/src/op/region.cc index e4984af1..2a1f2745 100644 --- a/src/op/region.cc +++ b/src/op/region.cc @@ -1,7 +1,14 @@ /*! * \file tl/op/region.cc - * \brief Define region operator. 
+ * \brief Define region operator (bridge to carry BufferRegion via Call args). * + * Notes: + * - BufferLoad/Ramp cannot represent a general PrimExpr as a vector lane + * count. Dynamic extents like (H1 - H0) cannot be encoded as + * Ramp(lanes = H1 - H0), and lowering BufferRegion to BufferLoad loses the + * explicit extent information. + * - tl.region carries both mins and extents in Call args and lets the backend + * reconstruct a BufferRegion faithfully. */ #include "region.h" @@ -11,27 +18,7 @@ namespace tvm { namespace tl { using namespace tir; -/** - * @brief Construct a RegionOp from TL operator arguments. - * - * Parses the TL `region` operator call arguments to populate the RegionOpNode: - * - Expects args[0] to be a `BufferLoad` whose `indices` are the per-dimension - * minima. - * - args[1] must be a constant integer used as the access mask. - * - args[2 + i] provides the extent for dimension `i`. - * - * The constructor validates that the number of load indices equals `args.size() - * - 2` and will abort via ICHECK on mismatch or if args[0] is not a - * `BufferLoad`. - * - * Parameters: - * - args: TL operator call arguments in the form - * [BufferLoad(min_i...), access_mask, extent_0, extent_1, ..., - * extent_{n-1}] where n = number of dimensions. - * - vmap: BufferMap passed through by the caller (not documented here as a - * generic utility). 
- */ -RegionOp::RegionOp(Array args, BufferMap vmap) { +RegionOp::RegionOp(Array args) { size_t n = args.size(); size_t ndim = n - 2; auto load = args[0].as(); @@ -39,10 +26,24 @@ RegionOp::RegionOp(Array args, BufferMap vmap) { ICHECK(load->indices.size() == ndim) << "load->indices.size() = " << load->indices << " ndim = " << ndim; Array ranges; + // Rebuild per-axis ranges from mins (BufferLoad indices) and provided extents for (size_t i = 0; i < ndim; i++) { - PrimExpr min = load->indices[i]; + PrimExpr index = load->indices[i]; PrimExpr extent = args[2 + i]; - ranges.push_back(Range::FromMinExtent(min, extent)); + if (const auto *ramp = index.as()) { + const auto *stride_imm = ramp->stride.as(); + ICHECK(stride_imm && stride_imm->value == 1) + << "RegionOp expects stride-1 Ramp for index"; + if (const auto *lanes_imm = ramp->lanes.as()) { + if (const auto *ext_imm = extent.as()) { + ICHECK_EQ(lanes_imm->value, ext_imm->value) + << "Ramp lanes and provided extent must match"; + } + } + ranges.push_back(Range::FromMinExtent(ramp->base, ramp->lanes)); + } else { + ranges.push_back(Range::FromMinExtent(index, extent)); + } } ObjectPtr node = tvm::ffi::make_object(); node->buffer_ = load->buffer; @@ -51,26 +52,11 @@ RegionOp::RegionOp(Array args, BufferMap vmap) { data_ = std::move(node); } -/** - * @brief Create a copy of this RegionOpNode and return it as a TileOperator. - * - * @return TileOperator A new TileOperator that owns a copied RegionOpNode. - */ TileOperator RegionOpNode::Clone() const { auto op = tvm::ffi::make_object(*this); return RegionOp(op); } -/** - * @brief Check whether the region spans the entire underlying buffer. - * - * Returns true if for every dimension the range minimum is zero and the - * range extent is structurally equal to the corresponding buffer shape - * dimension. Otherwise returns false. - * - * @return true if the region covers the full buffer in all dimensions; false - * otherwise. 
- */ bool RegionOpNode::IsFullRegion() const { for (size_t i = 0; i < ranges_.size(); i++) { if (!is_zero(ranges_[i]->min)) @@ -81,39 +67,26 @@ bool RegionOpNode::IsFullRegion() const { return true; } -/** - * @brief Lower the region operator to a TIR statement. - * - * Lowers this RegionOpNode into a TIR Stmt by delegating to the operator's - * evaluation path (currently `Evaluate(0)`). - * - * @param T Lowering context (provides buffers, producers/consumers and other - * environment required for lowering). - * @param analyzer Optional arithmetic analyzer used for simplification during - * lowering. - * @return Stmt The lowered TIR statement representing this region operation. - */ Stmt RegionOpNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const { return Evaluate(0); } -/** - * @brief Infers data layout for the region operator. - * - * This operator does not provide any layout inference; the function always - * returns an empty LayoutMap regardless of the provided arguments or inference - * level. - * - * @param T Layout inference arguments (ignored). - * @param level Inference granularity level (ignored). - * @return LayoutMap Empty map indicating no inferred layouts. - */ LayoutMap RegionOpNode::InferLayout(const LayoutInferArgs &T, InferLevel level) const { return {}; } -TIR_REGISTER_TL_OP(RegionOp, region) +const Op &RegionOp::Get() { + static const Op &op = Op::Get("tl.region"); + return op; +} + +TVM_REGISTER_OP("tl.region") + .set_attr("TScriptPrinterName", "region") + .set_attr("TLOpBuilder", + [](Array args) { + return RegionOp(args); + }) .set_num_inputs(-1) .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)); diff --git a/src/op/region.h b/src/op/region.h index e5c478bf..24399f7a 100644 --- a/src/op/region.h +++ b/src/op/region.h @@ -1,74 +1,36 @@ /*! - * \file tl/op/op.h - * \brief Tile library operations. + * \file tl/op/region.h + * \brief Tile memory region descriptor op (bridge to carry BufferRegion via + * Call args). 
* + * Why tl.region instead of passing BufferRegion directly? + * + * - While TIR can represent a BufferRegion, when a BufferRegion is passed as a + * call argument through call_intrin/FFI, the Python->C++ conversion lowers it + * to a BufferLoad(indices). To encode an interval inside indices, the FFI + * typically uses Ramp(base, stride, lanes) to represent a contiguous slice. + * - Ramp(lanes) may only be a constant or vscale*k (scalable vector). A general + * PrimExpr (e.g., H1 - H0) is not allowed as lanes, so dynamic extents would + * make the lowered BufferLoad invalid. + * - Moreover, BufferLoad only carries indices, not per-axis extents. Downstream + * tile operators (e.g., tl.copy, tl.reduce) that require both min and extent + * cannot losslessly recover dynamic extents from a BufferLoad alone. + * + * tl.region is a small transport-only op that solves this: + * - The frontend packs buffer + mins (from BufferLoad.indices) + extents into + * Call args, allowing dynamic extents to be expressed explicitly. + * - The backend (NormalizeToBufferRegion) reconstructs a BufferRegion from the + * tl.region call without losing information. + * - The op itself carries no semantics in Lower/InferLayout and is only used as + * a bridge for argument passing. */ #ifndef TVM_TL_OP_REGION_H_ #define TVM_TL_OP_REGION_H_ #include "./operator.h" -#include -#include -#include #include -/** - * Tile operator representing a memory region (buffer + ranges) used by TL - * passes. - * - * Encapsulates the target tir::Buffer, the region extents as an Array, - * and an access mask that indicates permitted or intended accesses for lowering - * and layout inference. - */ - -/** - * Lower this RegionOp into a TIR statement representing the region access. - * - * @param T Lowering-time arguments (e.g., loop/build context and value - * mappings). - * @param analyzer Arithmetic analyzer used to simplify and reason about - * expressions. 
- * @return A tir::Stmt that implements the region access/mutation described by - * this operator. - */ - -/** - * Infer the layout mapping for this region operator. - * - * Produces a LayoutMap describing how loop/axis indices map to buffer axes for - * layout-aware scheduling and subsequent operators. - * - * @param T Layout inference arguments (e.g., input layouts and shapes). - * @param level The inference detail level to use. - * @return A LayoutMap describing inferred mappings for the operator. - */ - -/** - * Return true when this RegionOp represents the full buffer region (i.e., - * ranges cover the entire buffer extent). - */ - -/** - * Create a shallow copy of this operator as a TileOperator handle. - * - * @return A TileOperator that references a cloned RegionOpNode. - */ - -/** - * Construct a RegionOp from argument expressions and a buffer map. - * - * @param args Positional expressions used to instantiate the operator - * (semantics depend on how RegionOp is invoked in TL pipelines). - * @param vmap Mapping from Buffer to replacement Buffer or buffer metadata used - * during creation. - */ - -/** - * Return the global Op registration for RegionOp. - * - * @return Reference to the registered tvm::Op describing the RegionOp. - */ namespace tvm { namespace tl { @@ -80,6 +42,12 @@ public: Array ranges_; int access_mask_; + /*! + * access_mask_ encodes the intended access type when the region is used as + * an argument to tile operators: 1=read, 2=write, 3=read-write. The mask is + * transport metadata only and does not affect lowering. + */ + TVM_FFI_DECLARE_OBJECT_INFO_FINAL("tl.RegionOp", RegionOpNode, TileOperatorNode); @@ -107,8 +75,13 @@ class RegionOp : public TileOperator { public: TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(RegionOp, TileOperator, RegionOpNode); - TVM_DLL RegionOp(Array args, BufferMap vmap); - + /*! + * Build a RegionOp from call arguments: + * - args[0]: BufferLoad whose indices are per-axis minima. 
+ * - args[1]: Integer access mask (1=r, 2=w, 3=rw). + * - args[2 + i]: Extent of axis i (supports dynamic PrimExpr). + */ + TVM_DLL RegionOp(Array args); static const Op &Get(); }; diff --git a/src/op/utils.cc b/src/op/utils.cc index 59960b57..7e56ae8c 100644 --- a/src/op/utils.cc +++ b/src/op/utils.cc @@ -12,8 +12,7 @@ namespace tl { using namespace tir; -BufferRegion NormalizeToBufferRegion(const PrimExpr &arg, - const BufferMap &vmap) { +BufferRegion NormalizeToBufferRegion(const PrimExpr &arg) { // Case 1: Already a BufferRegion if (arg->IsInstance()) { return Downcast(arg); @@ -38,23 +37,15 @@ BufferRegion NormalizeToBufferRegion(const PrimExpr &arg, return BufferRegion(load->buffer, ranges); } - // Case 3: Call nodes + // Case 3: tl.region(...) — reconstruct via RegionOp (bridge) if (const auto *call = arg.as()) { - // tl.region(...) — reconstruct via RegionOp if (call->op.same_as(RegionOp::Get())) { - RegionOp region(call->args, vmap); + RegionOp region(call->args); return BufferRegion(region->GetBuffer(), region->GetRanges()); } - // builtin.tvm_access_ptr(...) — map var to Buffer and take full region - if (call->op.same_as(builtin::tvm_access_ptr())) { - Var var = Downcast(call->args[1]); - Buffer buf = vmap.at(var); - Array ranges; - for (PrimExpr extent : buf->shape) { - ranges.push_back(Range(IntImm(extent->dtype, 0), extent)); - } - return BufferRegion(buf, ranges); - } + LOG(FATAL) << "Unsupported argument for BufferRegion (expect " + "BufferLoad/BufferRegion/tl.region): " + << arg; } LOG(FATAL) << "Unsupported argument for BufferRegion: " << arg; diff --git a/src/op/utils.h b/src/op/utils.h index 9e7880ac..d386b1a5 100644 --- a/src/op/utils.h +++ b/src/op/utils.h @@ -16,10 +16,10 @@ namespace tl { using namespace tir; -// Normalize an argument (BufferRegion/BufferLoad/tl.region/tvm_access_ptr) +// Normalize an argument (BufferRegion/BufferLoad/tl.region) // to BufferRegion so ops can uniformly consume regions. 
-TVM_DLL BufferRegion NormalizeToBufferRegion(const PrimExpr &arg, - const BufferMap &vmap); +// Note: tvm_access_ptr is no longer supported here. +TVM_DLL BufferRegion NormalizeToBufferRegion(const PrimExpr &arg); // Build a tvm_access_ptr(handle) from a BufferRegion. // - If `require_2d` is true, checks buffer ndim >= 2. diff --git a/src/transform/layout_inference.cc b/src/transform/layout_inference.cc index 873f70d0..f5ccc42b 100644 --- a/src/transform/layout_inference.cc +++ b/src/transform/layout_inference.cc @@ -437,11 +437,13 @@ private: if (op->op.as()) return; - auto p = ParseOperator(tvm::ffi::GetRef(op), GetBufferMap()); + auto p = ParseOperator(tvm::ffi::GetRef(op)); if (p.defined()) { for (const auto &arg : op->args) { if (auto buffer = getBufferFromAccessPtr(arg)) { addToUseList(buffer.value()); + } else if (auto buffer = getBufferFromRegion(arg)) { + addToUseList(buffer.value()); } } // Compute thread_var_ and thread_bounds_ @@ -495,6 +497,9 @@ private: } Optional getBufferFromAccessPtr(const PrimExpr &expr) { + if (auto bl = expr.as()) { + return bl->buffer; + } auto call = expr.as(); if (!call) { return std::nullopt; @@ -514,8 +519,18 @@ private: } } return std::nullopt; - } else if (call->op.same_as(RegionOp::Get())) { - return call->args[0].as()->buffer; + } + return std::nullopt; + } + + Optional getBufferFromRegion(const PrimExpr &expr) { + if (auto call = expr.as()) { + if (call->op.same_as(RegionOp::Get())) { + if (auto bl = call->args[0].as()) { + return bl->buffer; + } + return std::nullopt; + } } return std::nullopt; } diff --git a/src/transform/layout_reducer.cc b/src/transform/layout_reducer.cc index a3c69c43..660fc6fd 100644 --- a/src/transform/layout_reducer.cc +++ b/src/transform/layout_reducer.cc @@ -277,7 +277,7 @@ private: if (op->op.same_as(Fill::Get())) { ICHECK(!op->args.empty()); if (auto arg0_call = op->args[0].as()) { - // Case 1: tl.region(...) — extract buffer var from its first arg + // tl.region(...) 
— extract buffer var from its first arg if (arg0_call.value()->op.same_as(RegionOp::Get())) { ICHECK(!arg0_call.value()->args.empty()); if (auto bl = arg0_call.value()->args[0].as()) { @@ -285,15 +285,14 @@ private: if (reducer_info_map_.count(var)) { ICHECK(inside_reducer_range_.count(var) == 0) << "T.fill on reducer must be enclosed with a " - "T.finalize_reducer " - "before next."; + "T.finalize_reducer before next."; inside_reducer_range_.Set(var, reducer_info_map_.Get(var).value()); } } } - // Case 2: builtin.tvm_access_ptr(...) — existing path - else if (arg0_call.value()->op.same_as(builtin::tvm_access_ptr())) { + // builtin.tvm_access_ptr(...) — existing path (legacy) + if (arg0_call.value()->op.same_as(builtin::tvm_access_ptr())) { ICHECK(arg0_call.value()->args.size() > 1); if (auto var = arg0_call.value()->args[1].as(); var && reducer_info_map_.count(var.value())) { @@ -305,10 +304,33 @@ private: var.value(), reducer_info_map_.Get(var.value()).value()); } } + } else if (auto bl = op->args[0].as()) { + Var var = bl->buffer->data; + if (reducer_info_map_.count(var)) { + ICHECK(inside_reducer_range_.count(var) == 0) + << "T.fill on reducer must be enclosed with a T.finalize_reducer " + "before next."; + inside_reducer_range_.Set(var, reducer_info_map_.Get(var).value()); + } } } else if (op->op.same_as(FinalizeReducerOp::Get())) { ICHECK(op->args.size() == 1); - auto var = GetVarFromAccessPtr(op->args[0]); + Var var; + if (auto bl = op->args[0].as()) { + var = bl->buffer->data; + } else if (auto reg_call = op->args[0].as()) { + if (reg_call.value()->op.same_as(RegionOp::Get())) { + if (auto bl2 = reg_call.value()->args[0].as()) { + var = bl2->buffer->data; + } else { + LOG(FATAL) << "tl.region expects BufferLoad as first arg"; + } + } else { + var = GetVarFromAccessPtr(op->args[0]); + } + } else { + var = GetVarFromAccessPtr(op->args[0]); + } ICHECK(inside_reducer_range_.count(var) == 1) << "T.finalize_reducer must have a pairing T.fill ahead of it, " 
"enclosing a reduction range."; diff --git a/src/transform/lower_tile_op.cc b/src/transform/lower_tile_op.cc index 4c0ccfaf..4392f319 100644 --- a/src/transform/lower_tile_op.cc +++ b/src/transform/lower_tile_op.cc @@ -606,8 +606,7 @@ private: if (call && call->op.as()) return Downcast(IRMutatorWithAnalyzer::VisitStmt_(op)); - auto tile_op = - ParseOperator(tvm::ffi::GetRef(op), buffer_data_to_buffer_); + auto tile_op = ParseOperator(tvm::ffi::GetRef(op)); if (!tile_op.defined()) return IRMutatorWithAnalyzer::VisitStmt_(op); AddWorkspaceCallback callback = [this](int num_elem, DataType dtype) { diff --git a/testing/python/issue/test_tilelang_issue_830.py b/testing/python/issue/test_tilelang_issue_830.py index ab593712..950b8583 100644 --- a/testing/python/issue/test_tilelang_issue_830.py +++ b/testing/python/issue/test_tilelang_issue_830.py @@ -17,7 +17,15 @@ def _empty_kernel(): return empty_kernel +@tilelang.testing.requires_cuda def test_empty_kernel_lowering(): + # Ensure a valid CUDA runtime context is current on this thread for the + # target device before using driver API calls. Without this, calls like + # cuModuleLoadData can fail with CUDA_ERROR_INVALID_CONTEXT, especially + # for kernels that don't touch any device memory or streams beforehand + # (e.g., "empty" kernels) and therefore haven't triggered context + # creation implicitly. 
+ torch.cuda.set_device(0) kernel = _empty_kernel() kernel() @@ -59,7 +67,9 @@ def _empty_kernel_with_binding_variants(use_tuple_binding: bool = False): return kernel_with_tuple_kernel_binding if use_tuple_binding else kernel_with_scalar_kernel_binding +@tilelang.testing.requires_cuda def test_empty_kernel_with_binding_variants(): + torch.cuda.set_device(0) kernel = _empty_kernel_with_binding_variants() kernel() diff --git a/tilelang/intrinsics/mfma_macro_generator.py b/tilelang/intrinsics/mfma_macro_generator.py index 84e4c21b..02c0b039 100644 --- a/tilelang/intrinsics/mfma_macro_generator.py +++ b/tilelang/intrinsics/mfma_macro_generator.py @@ -2,14 +2,15 @@ from __future__ import annotations from tilelang import tvm as tvm import tilelang.language as T from tvm import DataType -from tvm.tir import PrimExpr, IndexMap, Buffer, Var, BufferRegion +from tvm import tir +from tvm.ir import Range +from tvm.tir import PrimExpr, IndexMap, Buffer, Var, BufferRegion, BufferLoad from tvm.runtime import convert -from .utils import ( - mfma_store_index_map,) +from .utils import (mfma_store_index_map) from typing import Literal, Callable from tilelang.utils import is_fragment -from tilelang.utils.language import to_buffer_region +from tilelang.utils.language import get_buffer_region_from_load from .mfma_layout import ( shared_16x4_to_local_64x1_layout_A, shared_4x16_to_local_64x1_layout_B, @@ -268,7 +269,7 @@ class MatrixCoreIntrinEmitter: _, reverse_index_map = self.get_ldmatrix_index_map(is_b=False) # legalize shared buffer to region - A_region = to_buffer_region(A_shared_buf) + A_region = self._legalize_to_buffer_region(A_shared_buf) A_buf = A_region.buffer A_base0 = A_region.region[-2].min A_base1 = A_region.region[-1].min @@ -314,7 +315,7 @@ class MatrixCoreIntrinEmitter: _, reverse_index_map = self.get_ldmatrix_index_map(is_b=True) # legalize shared buffer to region - B_region = to_buffer_region(B_shared_buf) + B_region = self._legalize_to_buffer_region(B_shared_buf) 
B_buf = B_region.buffer B_base0 = B_region.region[-2].min B_base1 = B_region.region[-1].min @@ -655,6 +656,33 @@ class MatrixCoreIntrinEmitter: forward_index_fn=forward_index, ) + @staticmethod + def _legalize_to_buffer_region(obj: Buffer | BufferLoad | BufferRegion) -> BufferRegion: + """ + Convert Buffer/BufferRegion/BufferLoad to a BufferRegion. + + - Buffer -> full-region BufferRegion covering entire shape + - BufferRegion -> returned as-is + - BufferLoad -> best-effort convert via get_buffer_region_from_load; + if scalar, fall back to 1-sized ranges at given indices + """ + if isinstance(obj, BufferRegion): + return obj + if isinstance(obj, Buffer): + mins = [tir.IntImm("int32", 0) for _ in obj.shape] + ranges = [Range.from_min_extent(m, e) for m, e in zip(mins, obj.shape)] + return BufferRegion(obj, ranges) + if isinstance(obj, BufferLoad): + region = get_buffer_region_from_load(obj) + if region is not None: + return region + # Fallback: scalar load -> 1-sized ranges at indices + mins = [idx for idx in obj.indices] + ones = [tir.IntImm("int32", 1) for _ in obj.indices] + ranges = [Range.from_min_extent(m, e) for m, e in zip(mins, ones)] + return BufferRegion(obj.buffer, ranges) + raise ValueError(f"Unsupported argument type for BufferRegion: {type(obj)}") + class MatrixCorePreshuffleIntrinEmitter(MatrixCoreIntrinEmitter): diff --git a/tilelang/intrinsics/mma_macro_generator.py b/tilelang/intrinsics/mma_macro_generator.py index 8c546c63..aab2a49e 100644 --- a/tilelang/intrinsics/mma_macro_generator.py +++ b/tilelang/intrinsics/mma_macro_generator.py @@ -3,14 +3,16 @@ import tilelang.language as T from typing import Literal, Callable from tilelang.common import TransformKind from tvm import DataType -from tvm.tir import PrimExpr, IndexMap, Buffer, Var, BufferRegion +from tvm import tir +from tvm.ir import Range +from tvm.tir import PrimExpr, IndexMap, Buffer, Var, BufferRegion, BufferLoad from tilelang import tvm as tvm from tvm.runtime import convert from 
.utils import ( mma_store_index_map, get_ldmatrix_offset, ) -from tilelang.utils import is_fragment, to_buffer_region +from tilelang.utils import is_fragment, get_buffer_region_from_load from tilelang.intrinsics.mma_layout import ( shared_16x8_to_mma_32x4_layout_sr_a, shared_16x8_to_mma_32x4_layout_sr_b, @@ -243,7 +245,7 @@ class TensorCoreIntrinEmitter: thread_binding = self.get_thread_binding() # legalize shared buffer to region - A_region = to_buffer_region(A_shared_buf) + A_region = self._legalize_to_buffer_region(A_shared_buf) A_buf = A_region.buffer A_base0 = A_region.region[-2].min A_base1 = A_region.region[-1].min @@ -294,7 +296,7 @@ class TensorCoreIntrinEmitter: thread_binding = self.get_thread_binding() # legalize shared buffer to region - A_region = to_buffer_region(A_shared_buf) + A_region = self._legalize_to_buffer_region(A_shared_buf) A_buf = A_region.buffer A_base0 = A_region.region[-2].min A_base1 = A_region.region[-1].min @@ -360,7 +362,7 @@ class TensorCoreIntrinEmitter: thread_binding = self.get_thread_binding() # legalize shared buffer to region - B_region = to_buffer_region(B_shared_buf) + B_region = self._legalize_to_buffer_region(B_shared_buf) B_buf = B_region.buffer B_base0 = B_region.region[-2].min B_base1 = B_region.region[-1].min @@ -397,7 +399,7 @@ class TensorCoreIntrinEmitter: thread_binding = self.get_thread_binding() # legalize shared buffer to region - B_region = to_buffer_region(B_shared_buf) + B_region = self._legalize_to_buffer_region(B_shared_buf) B_buf = B_region.buffer B_base0 = B_region.region[-2].min B_base1 = B_region.region[-1].min @@ -798,6 +800,33 @@ class TensorCoreIntrinEmitter: forward_index_fn=forward_index, ) + @staticmethod + def _legalize_to_buffer_region(obj: Buffer | BufferLoad | BufferRegion) -> BufferRegion: + """ + Convert Buffer/BufferRegion/BufferLoad to a BufferRegion. 
+ + - Buffer -> full-region BufferRegion covering entire shape + - BufferRegion -> returned as-is + - BufferLoad -> best-effort convert via get_buffer_region_from_load; + if scalar, fall back to 1-sized ranges at given indices + """ + if isinstance(obj, BufferRegion): + return obj + if isinstance(obj, Buffer): + mins = [tir.IntImm("int32", 0) for _ in obj.shape] + ranges = [Range.from_min_extent(m, e) for m, e in zip(mins, obj.shape)] + return BufferRegion(obj, ranges) + if isinstance(obj, BufferLoad): + region = get_buffer_region_from_load(obj) + if region is not None: + return region + # Fallback: scalar load -> 1-sized ranges at indices + mins = [idx for idx in obj.indices] + ones = [tir.IntImm("int32", 1) for _ in obj.indices] + ranges = [Range.from_min_extent(m, e) for m, e in zip(mins, ones)] + return BufferRegion(obj.buffer, ranges) + raise ValueError(f"Unsupported argument type for BufferRegion: {type(obj)}") + class TensorCoreIntrinEmitterWithLadderTransform(TensorCoreIntrinEmitter): """ diff --git a/tilelang/intrinsics/mma_sm70_macro_generator.py b/tilelang/intrinsics/mma_sm70_macro_generator.py index b20a6a90..78248081 100644 --- a/tilelang/intrinsics/mma_sm70_macro_generator.py +++ b/tilelang/intrinsics/mma_sm70_macro_generator.py @@ -5,7 +5,7 @@ from tvm import DataType from tvm.tir import PrimExpr, IndexMap, Buffer, Var, BufferRegion from tilelang import tvm as tvm from tvm.runtime import convert -from tilelang.utils import is_fragment, to_buffer_region +from tilelang.utils import is_fragment from tilelang.intrinsics.mma_sm70_layout import ( shared_16x4_to_mma_a_32x4_layout, shared_4x16_to_mma_b_32x4_layout, @@ -207,7 +207,7 @@ class TensorCoreIntrinEmitter: mma_load_layout = mma_load_a_32x4_to_shared_16x4_layout # legalize shared buffer to region - A_region = to_buffer_region(A_shared_buf) + A_region = self._legalize_to_buffer_region(A_shared_buf) A_buf = A_region.buffer A_base0 = A_region.region[-2].min A_base1 = A_region.region[-1].min @@ -248,7 
+248,7 @@ class TensorCoreIntrinEmitter: mma_load_layout = mma_load_b_32x4_to_shared_16x4_layout_trans if b_transposed else mma_load_b_32x4_to_shared_4x16_layout # legalize shared buffer to region - B_region = to_buffer_region(B_shared_buf) + B_region = self._legalize_to_buffer_region(B_shared_buf) B_buf = B_region.buffer B_base0 = B_region.region[-2].min B_base1 = B_region.region[-1].min diff --git a/tilelang/language/atomic.py b/tilelang/language/atomic.py index 6e5fa88c..56f87473 100644 --- a/tilelang/language/atomic.py +++ b/tilelang/language/atomic.py @@ -4,10 +4,9 @@ from __future__ import annotations import tilelang.language as T -from tvm import ir, tir +from tvm import ir from tvm.tir import PrimExpr, Buffer, BufferRegion, Var, op -from tilelang.language.utils import buffer_region_to_tile_region, buffer_load_to_tile_region -from tilelang.utils.language import get_buffer_region_from_load, legalize_pairwise_extents +from tilelang.utils.language import to_buffer_region, legalize_pairwise_extents _MEMORY_ORDER_ID_MAP = { "relaxed": 0, @@ -203,24 +202,8 @@ def atomic_add(dst: Buffer, dst_extent = list(dst_extent) if dst_extent else [1] * len(src_extent) src_extent, dst_extent = legalize_pairwise_extents(src_extent, dst_extent) - def _to_region(data, access_type, extent): - if isinstance(data, tir.Var) and T.has_let_value(data): - data = T.get_let_value(data) - if isinstance(data, tir.Buffer): - zeros = [tir.IntImm("int32", 0) for _ in extent] - return buffer_load_to_tile_region(tir.BufferLoad(data, zeros), access_type, extent) - elif isinstance(data, tir.BufferRegion): - return buffer_region_to_tile_region(data, access_type, extent) - elif isinstance(data, tir.BufferLoad): - region = get_buffer_region_from_load(data) - if region is None: - return buffer_load_to_tile_region(data, access_type, extent) - return buffer_region_to_tile_region(region, access_type, extent) - else: - return buffer_load_to_tile_region(data, access_type, extent) - - value = 
_to_region(value, "r", src_extent) - dst = _to_region(dst, "w", dst_extent) + value = to_buffer_region(value, access_type="r", extents=src_extent) + dst = to_buffer_region(dst, access_type="w", extents=dst_extent) # Note: tile-region-based atomic operations don't support return_prev yet # This would need to be implemented in the tile runtime diff --git a/tilelang/language/copy.py b/tilelang/language/copy.py index 62de13d0..d59d73e8 100644 --- a/tilelang/language/copy.py +++ b/tilelang/language/copy.py @@ -3,11 +3,11 @@ from __future__ import annotations from typing import Literal from tilelang import language as T from tilelang.utils.language import ( + to_buffer_region, get_buffer_region_from_load, legalize_pairwise_extents, ) from tvm import ir, tir -from tilelang.language.utils import buffer_region_to_tile_region, buffer_load_to_tile_region def copy(src: tir.Buffer | tir.BufferLoad | tir.BufferRegion, @@ -69,27 +69,9 @@ def copy(src: tir.Buffer | tir.BufferLoad | tir.BufferRegion, # - otherwise -> error src_extent, dst_extent = legalize_pairwise_extents(src_extent, dst_extent) - def _to_region(data, access_type, extent): - if isinstance(data, tir.Var) and T.has_let_value(data): - data = T.get_let_value(data) - if isinstance(data, tir.Buffer): - # Restrict a raw buffer to the computed copy extent by creating - # a BufferLoad at origin and passing the extents explicitly. 
- zeros = [tir.IntImm("int32", 0) for _ in extent] - return buffer_load_to_tile_region(tir.BufferLoad(data, zeros), access_type, extent) - elif isinstance(data, tir.BufferRegion): - return buffer_region_to_tile_region(data, access_type, extent) - elif isinstance(data, tir.BufferLoad): - region = get_buffer_region_from_load(data) - if region is None: - return buffer_load_to_tile_region(data, access_type, extent) - return buffer_region_to_tile_region(region, access_type, extent) - else: - return buffer_load_to_tile_region(data, access_type, extent) - # Use legalized extents for src and dst respectively. - src = _to_region(src, "r", src_extent) - dst = _to_region(dst, "w", dst_extent) + src = to_buffer_region(src, access_type="r", extents=src_extent) + dst = to_buffer_region(dst, access_type="w", extents=dst_extent) if coalesced_width is None: coalesced_width = -1 # PrimExpr can not be None @@ -129,6 +111,7 @@ def c2d_im2col(img: tir.Buffer, eviction_policy = 0 else: eviction_policy = {"evict_normal": 0, "evict_first": 1, "evict_last": 2}[eviction_policy] - return tir.call_intrin("handle", tir.op.Op.get("tl.c2d_im2col"), img.access_ptr("r"), - col.access_ptr("w"), nhw_step, c_step, kernel, stride, dilation, pad, - eviction_policy) + img_region = to_buffer_region(img, access_type="r") + col_region = to_buffer_region(col, access_type="w") + return tir.call_intrin("handle", tir.op.Op.get("tl.c2d_im2col"), img_region, col_region, + nhw_step, c_step, kernel, stride, dilation, pad, eviction_policy) diff --git a/tilelang/language/experimental/gemm_sp.py b/tilelang/language/experimental/gemm_sp.py index e966e7d6..7cc3d736 100644 --- a/tilelang/language/experimental/gemm_sp.py +++ b/tilelang/language/experimental/gemm_sp.py @@ -3,6 +3,7 @@ from __future__ import annotations from tilelang.primitives.gemm.base import GemmWarpPolicy import tilelang.language as T from tvm import tir +from tilelang.utils.language import to_buffer_region def gemm_sp( @@ -62,17 +63,18 @@ def gemm_sp( 
K_A = A_sparse.shape[0] if transpose_A else A_sparse.shape[1] K_B = B.shape[1] if transpose_B else B.shape[0] assert K_A * 2 == K_B, f"T.gemm_sp K shape check failed: K_A = {K_A}, K_B = {K_B}" - Aptr = A_sparse.access_ptr("r") - Bptr = B.access_ptr("r") - Cptr = C.access_ptr("rw") - Eptr = E.access_ptr("r") + # Build tl.region descriptors for operands + A_arg = to_buffer_region(A_sparse, access_type="r") + E_arg = to_buffer_region(E, access_type="r") + B_arg = to_buffer_region(B, access_type="r") + C_arg = to_buffer_region(C, access_type="rw") return tir.call_intrin( "handle", tir.op.Op.get("tl.gemm_sp"), - Aptr, - Eptr, - Bptr, - Cptr, + A_arg, + E_arg, + B_arg, + C_arg, transpose_A, transpose_B, M, diff --git a/tilelang/language/fill.py b/tilelang/language/fill.py index ad74720f..fbbcf1b6 100644 --- a/tilelang/language/fill.py +++ b/tilelang/language/fill.py @@ -2,12 +2,7 @@ from __future__ import annotations from tvm import tir from tilelang.language import has_let_value, get_let_value -from tilelang.utils.language import get_buffer_region_from_load -from tilelang.language.utils import ( - buffer_to_tile_region, - buffer_region_to_tile_region, - buffer_load_to_tile_region, -) +from tilelang.utils.language import get_buffer_region_from_load, to_buffer_region def fill(buffer: tir.Buffer | tir.BufferRegion | tir.BufferLoad, value: tir.PrimExpr): @@ -24,26 +19,21 @@ def fill(buffer: tir.Buffer | tir.BufferRegion | tir.BufferLoad, value: tir.Prim if isinstance(buffer, tir.Var) and has_let_value(buffer): buffer = get_let_value(buffer) - # Convert to a tl.region descriptor (PrimExpr) with write access - region_call = None + # Build tl.region as argument if isinstance(buffer, tir.Buffer): - region_call = buffer_to_tile_region(buffer, "w") + extents = list(buffer.shape) elif isinstance(buffer, tir.BufferRegion): extents = [r.extent for r in buffer.region] - region_call = buffer_region_to_tile_region(buffer, "w", extents) elif isinstance(buffer, tir.BufferLoad): region = 
get_buffer_region_from_load(buffer) if region is not None: extents = [r.extent for r in region.region] - region_call = buffer_region_to_tile_region(region, "w", extents) else: - # Fallback: treat element access as 1-extent per dim - region_call = buffer_load_to_tile_region(buffer, "w", [1] * len(buffer.indices)) + extents = [tir.IntImm("int32", 1) for _ in buffer.indices] else: - # As-is fallback (rare): pass through for downstream handling - region_call = buffer - - return tir.call_intrin("handle", tir.op.Op.get("tl.fill"), region_call, value) + extents = [] + return tir.call_intrin("handle", tir.op.Op.get("tl.fill"), + to_buffer_region(buffer, access_type="w", extents=extents), value) def clear(buffer: tir.Buffer | tir.Var): diff --git a/tilelang/language/gemm.py b/tilelang/language/gemm.py index 0f2e82d7..2bfd3a0c 100644 --- a/tilelang/language/gemm.py +++ b/tilelang/language/gemm.py @@ -7,10 +7,11 @@ from tilelang.utils.language import ( to_buffer_region, retrieve_shape, retrieve_stride, - retrieve_ptr, retrieve_offset, prim_expr_equal, ) +from tilelang.language.utils import ( + buffer_region_to_tile_region,) from tilelang.env import env as _env @@ -50,17 +51,17 @@ def _gemm_impl( C = legalize_arguments(C) mbar = legalize_arguments(mbar) if mbar is not None else None - # Normalize A/B/C to BufferRegion to pass into tl.gemm - A = to_buffer_region(A) - B = to_buffer_region(B) - C = to_buffer_region(C) + # Normalize A/B/C to BufferRegion for shape/stride/offset analysis + A_region = to_buffer_region(A) + B_region = to_buffer_region(B) + C_region = to_buffer_region(C) - A_shape = retrieve_shape(A) - B_shape = retrieve_shape(B) - C_shape = retrieve_shape(C) + A_shape = retrieve_shape(A_region) + B_shape = retrieve_shape(B_region) + C_shape = retrieve_shape(C_region) - A_stride = retrieve_stride(A) - B_stride = retrieve_stride(B) + A_stride = retrieve_stride(A_region) + B_stride = retrieve_stride(B_region) assert len(C_shape) == 2, "current only support C as a 2D 
tensor" assert len(A_shape) >= 2, "current only support A as a 2D or higher-order tensor" @@ -82,18 +83,22 @@ def _gemm_impl( stride_a = A_stride[-2] stride_b = B_stride[-2] - A_offset = retrieve_offset(A) - B_offset = retrieve_offset(B) + A_offset = retrieve_offset(A_region) + B_offset = retrieve_offset(B_region) assert A_offset[-2] == 0, "The offset of the first dimension of A must be 0" assert B_offset[-2] == 0, "The offset of the first dimension of B must be 0" offset_a = A_offset[-1] offset_b = B_offset[-1] - mbarptr = retrieve_ptr(mbar, "rw") if mbar is not None else tir.const(0, "uint32") - C_coords = [r.min for r in C.region] - return tir.call_intrin("handle", tir.op.Op.get(op_key), A, B, C, transpose_A, transpose_B, M, N, - K, policy, clear_accum, stride_a, stride_b, offset_a, offset_b, k_pack, - wg_wait, mbarptr, C_coords[0], C_coords[1]) + mbar = to_buffer_region(mbar, access_type="rw") if mbar is not None else tir.const(0, "uint32") + C_coords = [r.min for r in C_region.region] + # Convert BufferRegion to tl.region calls for arguments + A_arg = buffer_region_to_tile_region(A_region, "r", [r for r in A_shape]) + B_arg = buffer_region_to_tile_region(B_region, "r", [r for r in B_shape]) + C_arg = buffer_region_to_tile_region(C_region, "rw", [r for r in C_shape]) + return tir.call_intrin("handle", tir.op.Op.get(op_key), A_arg, B_arg, C_arg, transpose_A, + transpose_B, M, N, K, policy, clear_accum, stride_a, stride_b, offset_a, + offset_b, k_pack, wg_wait, mbar, C_coords[0], C_coords[1]) # Public wrappers diff --git a/tilelang/language/reduce.py b/tilelang/language/reduce.py index 9d84e0b2..3c4d8187 100644 --- a/tilelang/language/reduce.py +++ b/tilelang/language/reduce.py @@ -2,7 +2,7 @@ from __future__ import annotations from tvm import tir from tilelang.language import copy, macro, alloc_shared, alloc_fragment -from tilelang.language.utils import buffer_to_tile_region +from tilelang.utils.language import to_buffer_region from tilelang.utils.language 
import is_shared, is_fragment from tvm.script.ir_builder import IRBuilder @@ -51,8 +51,8 @@ def reduce(buffer: tir.Buffer, out: tir.Buffer, reduce_type: str, dim: int, clea tir.call_intrin( "handle", tir.op.Op.get("tl.reduce"), - buffer_to_tile_region(red_frag_in, "r"), - buffer_to_tile_region(red_frag_out, "w"), + to_buffer_region(red_frag_in, access_type="r"), + to_buffer_region(red_frag_out, access_type="w"), reduce_type, dim, clear, @@ -66,8 +66,8 @@ def reduce(buffer: tir.Buffer, out: tir.Buffer, reduce_type: str, dim: int, clea tir.call_intrin( "handle", tir.op.Op.get("tl.reduce"), - buffer_to_tile_region(red_frag_in, "r"), - buffer_to_tile_region(out, "w"), + to_buffer_region(red_frag_in, access_type="r"), + to_buffer_region(out, access_type="w"), reduce_type, dim, clear, @@ -79,8 +79,8 @@ def reduce(buffer: tir.Buffer, out: tir.Buffer, reduce_type: str, dim: int, clea tir.call_intrin( "handle", tir.op.Op.get("tl.reduce"), - buffer_to_tile_region(buffer, "r"), - buffer_to_tile_region(red_frag_out, "w"), + to_buffer_region(buffer, access_type="r"), + to_buffer_region(red_frag_out, access_type="w"), reduce_type, dim, clear, @@ -90,8 +90,8 @@ def reduce(buffer: tir.Buffer, out: tir.Buffer, reduce_type: str, dim: int, clea tir.call_intrin( "handle", tir.op.Op.get("tl.reduce"), - buffer_to_tile_region(buffer, "r"), - buffer_to_tile_region(out, "w"), + to_buffer_region(buffer, access_type="r"), + to_buffer_region(out, access_type="w"), reduce_type, dim, clear, @@ -246,8 +246,8 @@ def cumsum_fragment(src: tir.Buffer, dst: tir.Buffer, dim: int, reverse: bool) - tir.call_intrin( "handle", tir.op.Op.get("tl.cumsum"), - buffer_to_tile_region(cumsum_smem, "r"), - buffer_to_tile_region(cumsum_smem, "w"), + to_buffer_region(cumsum_smem, access_type="r"), + to_buffer_region(cumsum_smem, access_type="w"), dim, reverse, ) @@ -300,8 +300,8 @@ def cumsum(src: tir.Buffer, dst: tir.Buffer | None = None, dim: int = 0, reverse return tir.call_intrin( "handle", 
tir.op.Op.get("tl.cumsum"), - buffer_to_tile_region(src, "r"), - buffer_to_tile_region(dst, "w"), + to_buffer_region(src, access_type="r"), + to_buffer_region(dst, access_type="w"), dim, reverse, ) @@ -323,7 +323,7 @@ def finalize_reducer(reducer: tir.Buffer): return tir.call_intrin( "handle", tir.op.Op.get("tl.finalize_reducer"), - reducer.access_ptr("w"), + to_buffer_region(reducer, access_type="w"), ) diff --git a/tilelang/language/utils.py b/tilelang/language/utils.py index ad8b83dd..75fea4c0 100644 --- a/tilelang/language/utils.py +++ b/tilelang/language/utils.py @@ -1,97 +1,38 @@ from tilelang import tvm as tvm from tvm import tir -from tvm.tir import PrimExpr, Buffer, BufferLoad, op +from tvm.tir import PrimExpr, BufferLoad, op from tilelang import language as T def region(buffer: BufferLoad, access_type: str, *args: PrimExpr): - """ - Create a tile memory-region descriptor for a BufferLoad. - - Maps access_type ('r', 'w', 'rw') to the numeric codes expected by the `tl.region` intrinsic - (1, 2, 3 respectively) and returns a tir.Call representing the region with the provided extents. - - Parameters: - buffer (tir.BufferLoad): The BufferLoad that identifies the underlying buffer and indices. - access_type (str): One of 'r', 'w', or 'rw' indicating read, write, or read-write access. - *args (tir.PrimExpr): Extent expressions for each region dimension. - - Returns: - tir.Call: A call to the `tl.region` intrinsic describing the memory region. - - Raises: - KeyError: If access_type is not one of 'r', 'w', or 'rw'. - """ + """Create a tl.region call for a BufferLoad and extents.""" access_type = {"r": 1, "w": 2, "rw": 3}[access_type] return T.call_intrin("handle", op.Op.get("tl.region"), buffer, access_type, *args) -def buffer_to_tile_region(buffer: Buffer, access_type: str): - """Convert a TVM buffer to a tile region descriptor. 
- - Args: - buffer (tir.Buffer): The buffer to convert - access_type (str): Type of access - 'r' for read, 'w' for write, 'rw' for read-write - - Returns: - tir.Call: A region descriptor covering the entire buffer - """ - mins = [0 for _ in buffer.shape] - extents = [x for x in buffer.shape] - return region(T.BufferLoad(buffer, mins), access_type, *extents) - - def buffer_load_to_tile_region(load: BufferLoad, access_type: str, extents: list[PrimExpr]): - """Convert a buffer load operation to a tile region descriptor. - - Args: - load (tir.BufferLoad): The buffer load operation - access_type (str): Type of access - 'r' for read, 'w' for write, 'rw' for read-write - extents (List[tir.PrimExpr]): List of expressions defining the region size - - Returns: - tir.Call: A region descriptor for the loaded area - """ - indices = load.indices - + """Convert a BufferLoad to a tl.region call with explicit extents.""" + indices = list(load.indices) if len(indices) > len(extents): - # (f"mismatch between indices and extents for buffer load {load}: indices = {indices}, extents = {extents}, " - # f"region will be expanded in the last 2 dimensions") - new_extents = [] - for _ in range(len(indices) - len(extents)): - new_extents.append(1) - for extent in extents: - new_extents.append(extent) - extents = new_extents + extents = [tir.IntImm("int32", 1) for _ in range(len(indices) - len(extents)) + ] + list(extents) assert len(indices) == len(extents), f"indices = {indices}, extents = {extents}" return region(load, access_type, *extents) def buffer_region_to_tile_region(buffer_region: tir.BufferRegion, access_type: str, extents: list[tir.PrimExpr]): - """Convert a buffer region to a tile region descriptor. 
- - Args: - buffer_region (tir.BufferRegion): The buffer region to convert - access_type (str): Type of access - 'r' for read, 'w' for write, 'rw' for read-write - - Returns: - tir.Call: A region descriptor for the specified buffer region - """ - mins = [x.min for x in buffer_region.region] - region_extents = [x.extent for x in buffer_region.region] - assert len(region_extents) >= len( - extents - ), f"region_extents must be >= extents, region_extents = {region_extents}, extents = {extents}" - - # Clamp extents element-wise so that the produced region respects the - # requested copy/fill extent, supporting dynamic PrimExpr via tir.min. + """Clamp extents and return a tl.region call.""" + mins = [r.min for r in buffer_region.region] + region_extents = [r.extent for r in buffer_region.region] + assert len(region_extents) >= len(extents), ( + f"region_extents must be >= extents, region_extents = {region_extents}, extents = {extents}" + ) clamped_extents = [ tir.min(region_extents[i], extents[i]) if i < len(extents) else region_extents[i] for i in range(len(region_extents)) ] - - return region(T.BufferLoad(buffer_region.buffer, mins), access_type, *clamped_extents) + return region(tir.BufferLoad(buffer_region.buffer, mins), access_type, *clamped_extents) def index_to_coordinates(index, shape) -> list[PrimExpr]: diff --git a/tilelang/tileop/gemm/gemm_base.py b/tilelang/tileop/gemm/gemm_base.py index 021f59a4..581272cf 100644 --- a/tilelang/tileop/gemm/gemm_base.py +++ b/tilelang/tileop/gemm/gemm_base.py @@ -123,6 +123,10 @@ class GemmBase: def mbarptr(self) -> PrimExpr: return getattr(self.gemm_node, "mbarPtr", tvm.tir.const(0, "uint32")) + @property + def mbar(self) -> tir.Buffer: + return getattr(self.gemm_node, "mbar", None) + @property def C_coords(self): coords = getattr(self.gemm_node, "cCoords", None) diff --git a/tilelang/tileop/gemm/gemm_tcgen05.py b/tilelang/tileop/gemm/gemm_tcgen05.py index 52c192e5..c2c8c1c8 100644 --- a/tilelang/tileop/gemm/gemm_tcgen05.py 
+++ b/tilelang/tileop/gemm/gemm_tcgen05.py @@ -94,9 +94,11 @@ class GemmTCGEN5(GemmBase): if self.wg_wait != -1: raise ValueError("TCGEN5MMA currently requires wg_wait == -1") - mbarptr = self.mbarptr - if mbarptr == 0: - raise ValueError("TCGEN5MMA requires a valid mbarrier pointer") + mbar = self.mbar + if mbar == 0: + raise ValueError("TCGEN5MMA requires a valid mbarrier") + + mbarptr = mbar.access_ptr("rw") C_coords = self.C_coords if len(C_coords) != 2: @@ -110,11 +112,10 @@ class GemmTCGEN5(GemmBase): B_shared = self.BRegion C_local = self.C clear_accum = self.clear_accum - mbar = self.mbarptr @T.prim_func def _gemm_ss() -> None: if thread_var // 32 == 0: - mma_emitter.tcgen05mma(A_shared, B_shared, C_local, mbar, clear_accum) + mma_emitter.tcgen05mma(A_shared, B_shared, C_local, mbarptr, clear_accum) return _Simplify(_gemm_ss, inline_let=True) diff --git a/tilelang/utils/__init__.py b/tilelang/utils/__init__.py index e13905f8..a713df8e 100644 --- a/tilelang/utils/__init__.py +++ b/tilelang/utils/__init__.py @@ -15,5 +15,6 @@ from .language import ( retrive_ptr_from_buffer_region, # noqa: F401 is_full_region, # noqa: F401 to_buffer_region, # noqa: F401 + get_buffer_region_from_load, # noqa: F401 ) from .deprecated import deprecated # noqa: F401 diff --git a/tilelang/utils/language.py b/tilelang/utils/language.py index e9fe13da..41da8ab0 100644 --- a/tilelang/utils/language.py +++ b/tilelang/utils/language.py @@ -1,10 +1,10 @@ from __future__ import annotations from tvm.tir import Buffer, BufferLoad, BufferRegion, PrimExpr +from tilelang.language.utils import region as _make_region_call from functools import reduce from tvm import IRModule, DataType from tvm.tir import PrimFunc from tvm import ir, tir - # Scope Checkers for TVM Buffers # These utility functions check the memory scope of a given TVM buffer. 
@@ -159,7 +159,8 @@ def retrieve_func_from_module(ir_module: IRModule) -> PrimFunc: return func -def get_buffer_region_from_load(buffer_load: tir.BufferLoad) -> tir.BufferRegion | None: +def get_buffer_region_from_load(buffer_load: tir.BufferLoad, + extents: list[PrimExpr] | None = None) -> tir.BufferRegion | None: """ Get the buffer region from a buffer load. @@ -170,45 +171,71 @@ def get_buffer_region_from_load(buffer_load: tir.BufferLoad) -> tir.BufferRegion buffer, indices = buffer_load.buffer, buffer_load.indices regions = [] found_ramp: bool = False - for indice in indices: + + if extents is not None: + assert len(extents) == len(indices), "extents should have the same length as indices" + for i, indice in enumerate(indices): if isinstance(indice, tir.Ramp): + assert extents is None, "extents should be provided for BufferLoad with Ramp indices" regions.append(ir.Range.from_min_extent(indice.base, indice.lanes)) found_ramp = True elif isinstance(indice, tir.PrimExpr): - regions.append(ir.Range.from_min_extent(indice, 1)) + if extents is not None: + regions.append(ir.Range.from_min_extent(indice, extents[i])) + found_ramp = True + else: + regions.append(ir.Range.from_min_extent(indice, 1)) else: - raise ValueError("Unsupported type: ", type(indice)) + raise ValueError(f"Unsupported type: {type(indice)} for index {i}") if found_ramp: return tir.BufferRegion(buffer, regions) else: return None -def to_buffer_region(obj: Buffer | BufferLoad | BufferRegion) -> BufferRegion: +def to_buffer_region(obj: Buffer | BufferLoad | BufferRegion | tir.Var, + access_type: str = "rw", + extents: list[PrimExpr] | None = None) -> PrimExpr | BufferRegion: """ - Convert Buffer/BufferRegion/BufferLoad to a BufferRegion. + Convert to/from the tl.region representation. 
- - Buffer -> full-region BufferRegion covering entire shape - - BufferRegion -> returned as-is - - BufferLoad -> best-effort convert via get_buffer_region_from_load; - if scalar, fall back to 1-sized ranges at given indices + - Buffer/BufferLoad/BufferRegion -> returns a tl.region call (PrimExpr) + - tl.region Call -> returns the decoded BufferRegion for analysis """ + from tilelang.language.frame import has_let_value, get_let_value + if isinstance(obj, tir.Var) and has_let_value(obj): + obj = get_let_value(obj) + # Encode into tl.region call (when extents is provided), otherwise return BufferRegion for analysis if isinstance(obj, tir.BufferRegion): - return obj + if extents is None: + return obj + mins = [r.min for r in obj.region] + exts = [r.extent for r in obj.region] + assert len(extents) == len(exts) + exts = [tir.min(exts[i], extents[i]) for i in range(len(exts))] + return _make_region_call(tir.BufferLoad(obj.buffer, mins), access_type, *exts) if isinstance(obj, tir.Buffer): mins = [tir.IntImm("int32", 0) for _ in obj.shape] - ranges = [ir.Range.from_min_extent(m, e) for m, e in zip(mins, obj.shape)] - return tir.BufferRegion(obj, ranges) + if extents is None: + ranges = [ir.Range.from_min_extent(m, e) for m, e in zip(mins, obj.shape)] + return tir.BufferRegion(obj, ranges) + exts = list(extents) + return _make_region_call(tir.BufferLoad(obj, mins), access_type, *exts) if isinstance(obj, tir.BufferLoad): - region = get_buffer_region_from_load(obj) - if region is not None: - return region - # Fallback: scalar load -> 1-sized ranges at indices - mins = [idx for idx in obj.indices] - ones = [tir.IntImm("int32", 1) for _ in obj.indices] - ranges = [ir.Range.from_min_extent(m, e) for m, e in zip(mins, ones)] - return tir.BufferRegion(obj.buffer, ranges) - raise ValueError(f"Unsupported argument type for BufferRegion: {type(obj)}") + if extents is None: + region = get_buffer_region_from_load(obj) + if region is not None: + return region + mins = [idx for idx in 
obj.indices] + ones = [tir.IntImm("int32", 1) for _ in obj.indices] + ranges = [ir.Range.from_min_extent(m, e) for m, e in zip(mins, ones)] + return tir.BufferRegion(obj.buffer, ranges) + exts = list(extents) + if len(obj.indices) > len(exts): + exts = [tir.IntImm("int32", 1) for _ in range(len(obj.indices) - len(exts))] + exts + assert len(obj.indices) == len(exts) + return _make_region_call(obj, access_type, *exts) + raise ValueError(f"Unsupported argument type for to_buffer_region: {type(obj)}") def retrieve_shape(obj: Buffer | BufferRegion | BufferLoad) -> list: -- GitLab From f0c721a467ed0e535b160e3f7e76709faa77cf57 Mon Sep 17 00:00:00 2001 From: Yunqian Fan Date: Wed, 26 Nov 2025 15:44:00 +0800 Subject: [PATCH 051/139] [Enhancement] add more dtype and fix mma.ws for fp16 for tcgen05 (#1327) * feat: add fp8 variants; add placeholder for fp6/fp4 in meta support ld with pack for fp32 dtype add dump add tempalte expand remove unused dtype and change to rebased apis * fix: when atom-m!=128, enable_ws * fix: typo in tcgen05 meta; dispatch in gemm sm100 --- .../example_tilelang_gemm_fp8_sm100.py | 126 +++ src/op/copy.cc | 14 +- src/op/gemm_py.cc | 2 + src/op/tcgen5_meta.h | 38 +- src/tl_templates/cuda/copy_sm100.h | 35 +- src/tl_templates/cuda/gemm_sm100.h | 82 +- src/tl_templates/cuda/tcgen_05_ld.h | 755 +++++++++++++++++- tilelang/intrinsics/mma_macro_generator.py | 3 + .../intrinsics/tcgen05_macro_generator.py | 9 +- tilelang/jit/adapter/wrapper.py | 1 + tilelang/tileop/gemm/gemm_tcgen05.py | 5 +- 11 files changed, 980 insertions(+), 90 deletions(-) create mode 100644 examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py new file mode 100644 index 00000000..4628a997 --- /dev/null +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py @@ -0,0 +1,126 @@ +import torch +import tilelang +import tilelang.language as T +from tilelang.utils.tensor 
import map_torch_type + + +def matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + mbar = T.alloc_barrier(1) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[bx * block_N, k * block_K], B_shared) + T.gemm_v2( + A_shared, + B_shared, + C_tmem, + trans_A, + trans_B, + mbar=mbar, + wg_wait=-1, + clear_accum=(k == 0), + ) + T.mbarrier_wait_parity(mbar, k % 2) + + T.copy(C_tmem, C_local) + T.copy(C_local, C_shared) + + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return main + + +def calc_diff(x, y): + x, y = x.double(), y.double() + denominator = (x * x + y * y).sum() + sim = 2 * (x * y).sum() / denominator + return 1 - sim + + +M, N, K = 4096, 4096, 8192 +block_M, block_N, block_K = 64, 256, 32 +trans_A, trans_B = False, True +num_stages = 2 +threads = 256 +for tvm_fp8_dtype in ["float8_e4m3", "float8_e5m2"]: + for tvm_acc_dtype in ["float16", "float32"]: # , torch.float16]: + torch_fp8_dtype = map_torch_type(tvm_fp8_dtype) + torch_acc_dtype = map_torch_type(tvm_acc_dtype) + print(f"running {tvm_fp8_dtype} -> {tvm_acc_dtype}") + in_dtype, out_dtype, 
accum_dtype = tvm_fp8_dtype, tvm_acc_dtype, tvm_acc_dtype + + func = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, + ) + jit_kernel = tilelang.compile( + func, + out_idx=[2], + target="cuda", + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_ENABLE_PTXAS_VERBOSE_OUTPUT: True, + }, + ) + # jit_kernel.export_ptx("./dump.ptx") + # jit_kernel.export_sources("./dump.cu") + + a = torch.randn(M, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) + b = torch.randn(N, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) + + c = jit_kernel(a, b) + ref_c = (a.to(torch.half) @ b.T.to(torch.half)).float() + c = c.float() + diff = calc_diff(c, ref_c) + # assert diff < 1e-3, f"{diff}" + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] diff = {diff}") + + profiler = jit_kernel.get_profiler() + latency = profiler.do_bench() + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Latency: {latency} ms") + print( + f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Flops: {2 * M * N * K / (latency / 1e3) / 1e12} TFLOPS" + ) diff --git a/src/op/copy.cc b/src/op/copy.cc index 9b93fea1..b0cac131 100644 --- a/src/op/copy.cc +++ b/src/op/copy.cc @@ -1118,6 +1118,11 @@ Stmt CopyNode::LowerTmemCopy(const LowerArgs &T, bool is_ld = false; // tcgen05.ld (tensor memory -> register) bool is_st = false; // tcgen05.st (register -> tensor memory) bool is_cp = false; // tcgen05.cp (shared memory -> tensor memory) + bool src_needs_pack = + 16 == src->dtype.bits(); // if needs .pack::16b when is_ld + bool dst_needs_unpack = + 16 == dst->dtype.bits(); // if needs .unpack::16b when is_st + if (src.scope() == "shared.tmem" && dst.scope() == "local.fragment") { is_ld = true; } else if (src.scope() == "local.fragment" && dst.scope() == "shared.tmem") { @@ -1125,9 +1130,8 @@ Stmt CopyNode::LowerTmemCopy(const 
LowerArgs &T, } else if (src.scope() == "shared.dyn" && dst.scope() == "shared.tmem") { is_cp = true; } else { - ICHECK(0) << "Unsupported tensor memory copy: " - << "src scope = " << src.scope() - << ", dst scope = " << dst.scope(); + ICHECK(0) << "Unsupported tensor memory copy: " << "src scope = " + << src.scope() << ", dst scope = " << dst.scope(); } // Currently tcgen05.cp is not supported // TODO (mzw) Support tcgen05.cp @@ -1247,8 +1251,10 @@ Stmt CopyNode::LowerTmemCopy(const LowerArgs &T, : relative_wg_idx * (num_chunks_each_wg * meta.width); have_succeeded = true; Array args; + const char *bool_str = src_needs_pack ? "true" : "false"; args.push_back(StringImm(meta.intrinsics_name + "<" + - std::to_string(num_chunks_each_wg) + ">")); + std::to_string(num_chunks_each_wg) + ", " + + bool_str + ">")); args.push_back( BufferLoad(src, {(int)logical_row_min, (int)logical_col_min})); // Will be translated later diff --git a/src/op/gemm_py.cc b/src/op/gemm_py.cc index 511a4283..aa6c0282 100644 --- a/src/op/gemm_py.cc +++ b/src/op/gemm_py.cc @@ -344,6 +344,8 @@ TVM_FFI_STATIC_INIT_BLOCK() { result.push_back(Integer(meta.atom_m)); result.push_back(Integer(meta.atom_n)); result.push_back(Integer(meta.atom_k)); + result.push_back(Integer(meta.enable_ws)); + result.push_back(Integer(meta.enable_2cta)); } return result; }); diff --git a/src/op/tcgen5_meta.h b/src/op/tcgen5_meta.h index bb63c8dc..3d994bf5 100644 --- a/src/op/tcgen5_meta.h +++ b/src/op/tcgen5_meta.h @@ -15,16 +15,19 @@ using runtime::DataType; struct TCGEN5MMAMeta { int atom_m, atom_n, atom_k; + bool enable_ws, enable_2cta; }; inline std::pair GetTCGEN5MMAMeta(int M, int N, int K, DataType ab_dtype, DataType c_dtype) { // TODO (lei) Currently not all shapes / dtypes are supported for TCGEN5MMA. 
#define FAIL \ - return { false, TCGEN5MMAMeta{0, 0, 0} } -#define SUCCESS(atom_m, atom_n, atom_k) \ return { \ - true, TCGEN5MMAMeta { atom_m, atom_n, atom_k } \ + false, TCGEN5MMAMeta { 0, 0, 0, false, false } \ + } +#define SUCCESS(atom_m, atom_n, atom_k, use_ws, use_2cta) \ + return { \ + true, TCGEN5MMAMeta { atom_m, atom_n, atom_k, use_ws, use_2cta } \ } std::vector ws_valid_atom_ns = {256, 128, 64}; if ((ab_dtype.is_bfloat16() || ab_dtype.is_float16()) && @@ -34,39 +37,52 @@ GetTCGEN5MMAMeta(int M, int N, int K, DataType ab_dtype, DataType c_dtype) { if (M % 128 == 0) { for (int atom_n = 256; atom_n >= 16; atom_n -= 16) if (N % atom_n == 0) - SUCCESS(128, atom_n, 16); + SUCCESS(128, atom_n, 16, false, false); FAIL; } else if (M % 64 == 0) { for (int atom_n : ws_valid_atom_ns) if (N % atom_n == 0) - SUCCESS(64, atom_n, 16); + SUCCESS(64, atom_n, 16, true, false); FAIL; } else if (M % 32 == 0) { for (int atom_n : ws_valid_atom_ns) if (N % atom_n == 0) - SUCCESS(32, atom_n, 16); + SUCCESS(32, atom_n, 16, true, false); FAIL; } else { FAIL; } - } else if ((ab_dtype.is_float8_e4m3fn() || ab_dtype.is_float8_e5m2()) && - (c_dtype.is_float() && c_dtype.bits() == 32)) { + } else if ((ab_dtype.is_float8_e4m3fn() || ab_dtype.is_float8_e4m3() || + ab_dtype.is_float8_e5m2() || ab_dtype.is_float8_e5m2fnuz() || + ab_dtype.is_float6_e2m3fn() || ab_dtype.is_float6_e3m2fn() || + ab_dtype.is_float4_e2m1fn()) && + ((c_dtype.is_float() && c_dtype.bits() == 32) || + (c_dtype.is_float16() && c_dtype.bits() == 16))) { if (K % 32 != 0) FAIL; if (M % 128 == 0) { + for (int atom_n : ws_valid_atom_ns) + if (N % atom_n == 0) + SUCCESS(128, atom_n, 32, true, false); for (int atom_n = 256; atom_n >= 16; atom_n -= 16) if (N % atom_n == 0) - SUCCESS(128, atom_n, 32); + SUCCESS(128, atom_n, 32, false, true); + for (int atom_n = 256; atom_n >= 8; atom_n -= 8) + if (N % atom_n == 0) + SUCCESS(128, atom_n, 32, false, false); FAIL; } else if (M % 64 == 0) { for (int atom_n : ws_valid_atom_ns) if 
(N % atom_n == 0) - SUCCESS(64, atom_n, 32); + SUCCESS(64, atom_n, 32, true, false); + for (int atom_n = 256; atom_n >= 8; atom_n -= 8) + if (N % atom_n == 0) + SUCCESS(64, atom_n, 32, false, false); FAIL; } else if (M % 32 == 0) { for (int atom_n : ws_valid_atom_ns) if (N % atom_n == 0) - SUCCESS(32, atom_n, 32); + SUCCESS(32, atom_n, 32, true, false); FAIL; } else { FAIL; diff --git a/src/tl_templates/cuda/copy_sm100.h b/src/tl_templates/cuda/copy_sm100.h index c4047c34..aa898bcc 100644 --- a/src/tl_templates/cuda/copy_sm100.h +++ b/src/tl_templates/cuda/copy_sm100.h @@ -51,6 +51,21 @@ __device__ __forceinline__ void st_global_256(fp8_e4_32_t *ptr, : : "l"(ptr), "l"(val.x), "l"(val.y), "l"(val.z), "l"(val.w)); } +__device__ __forceinline__ ulonglong4 ld_global_256(const fp8_e5_32_t *ptr) { + ulonglong4 ret; + asm volatile("ld.global.v4.u64 {%0, %1, %2, %3}, [%4];" + : "=l"(ret.x), "=l"(ret.y), "=l"(ret.z), "=l"(ret.w) + : "l"(ptr)); + return ret; +} + +__device__ __forceinline__ void st_global_256(fp8_e5_32_t *ptr, + fp8_e5_32_t &val8) { + ulonglong4 &val = *((ulonglong4 *)&val8); + asm volatile("st.global.v4.u64 [%0], {%1, %2, %3, %4};" + : + : "l"(ptr), "l"(val.x), "l"(val.y), "l"(val.z), "l"(val.w)); +} __device__ __forceinline__ unsigned long long pack_bfloat16x4(const bfloat16_t x, const bfloat16_t y, const bfloat16_t z, @@ -95,38 +110,38 @@ __device__ __forceinline__ void tcgen05_ld_core(uint32_t const &tmem_start_col, } } -template +template __device__ __forceinline__ void tcgen05_ld_32dp32bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core(tmem_start_col + tmem_col_offset, - dst_ptr); + tcgen05_ld_core, 7, N>( + tmem_start_col + tmem_col_offset, dst_ptr); tl::fence_view_async_tmem_load(); } -template +template __device__ __forceinline__ void tcgen05_ld_32dp64bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core(tmem_start_col + tmem_col_offset, - 
dst_ptr); + tcgen05_ld_core, 7, N>( + tmem_start_col + tmem_col_offset, dst_ptr); tl::fence_view_async_tmem_load(); } -template +template __device__ __forceinline__ void tcgen05_ld_32dp128bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core( + tcgen05_ld_core, 6, N>( tmem_start_col + tmem_col_offset, dst_ptr); tl::fence_view_async_tmem_load(); } -template +template __device__ __forceinline__ void tcgen05_ld_32dp256bNx(uint32_t const &tmem_start_col, uint32_t const &tmem_col_offset, dst_t *dst_ptr) { - tcgen05_ld_core( + tcgen05_ld_core, 5, N>( tmem_start_col + tmem_col_offset, dst_ptr); tl::fence_view_async_tmem_load(); } diff --git a/src/tl_templates/cuda/gemm_sm100.h b/src/tl_templates/cuda/gemm_sm100.h index 856d37dd..84e22f24 100644 --- a/src/tl_templates/cuda/gemm_sm100.h +++ b/src/tl_templates/cuda/gemm_sm100.h @@ -243,47 +243,99 @@ struct DispatchInstruction -struct DispatchInstruction> { - using MMA = MMA_Traits, - Int, integral_constant, - integral_constant, - integral_constant, - integral_constant>; + using MMA = + MMA_Traits, Int, integral_constant, + integral_constant, + integral_constant, + integral_constant>; }; template -struct DispatchInstruction> { using MMA = - MMA_Traits, - Int, integral_constant, + MMA_Traits, Int, integral_constant, integral_constant, integral_constant, integral_constant>; }; template -struct DispatchInstruction> { - using MMA = MMA_Traits, - Int, integral_constant, + using MMA = MMA_Traits, Int, + integral_constant, integral_constant, integral_constant, integral_constant>; }; +template +struct DispatchInstruction> { + using MMA = MMA_Traits, Int, + integral_constant, + integral_constant, + integral_constant, + integral_constant>; +}; + +template +struct DispatchInstruction> { + using MMA = + MMA_Traits, Int, integral_constant, + integral_constant, + integral_constant, + integral_constant>; +}; template -struct DispatchInstruction> { using MMA = - MMA_Traits, - Int, 
integral_constant, + MMA_Traits, Int, integral_constant, integral_constant, integral_constant, integral_constant>; }; +template +struct DispatchInstruction> { + using MMA = MMA_Traits, Int, + integral_constant, + integral_constant, + integral_constant, + integral_constant>; +}; +template +struct DispatchInstruction> { + using MMA = MMA_Traits, Int, + integral_constant, + integral_constant, + integral_constant, + integral_constant>; +}; + template diff --git a/src/tl_templates/cuda/tcgen_05_ld.h b/src/tl_templates/cuda/tcgen_05_ld.h index b2eb2f81..9e5e3420 100644 --- a/src/tl_templates/cuda/tcgen_05_ld.h +++ b/src/tl_templates/cuda/tcgen_05_ld.h @@ -10,7 +10,9 @@ namespace tl { // 32 data path lanes, 32-bit pattern, repeated N times -class tmem_ld_32dp32bNx { +template class tmem_ld_32dp32bNx; + +template <> class tmem_ld_32dp32bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { @@ -180,9 +182,180 @@ public: } } }; +template <> class tmem_ld_32dp32bNx { +public: + template + static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 128, + "N must be a power of 2 and lies between 1 ~ 128"); + + if constexpr (N == 1) { + asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x1.b32" + "{%0}," + "[%1];\n" + : "=r"(dst_ptr[0]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x2.b32" + "{%0, %1}," + "[%2];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) + : "r"(src_addr)); + } else if constexpr (N == 4) { + asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x4.b32" + "{%0, %1, %2, %3}," + "[%4];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]) + : "r"(src_addr)); + } else if constexpr (N == 8) { + asm volatile("tcgen05.ld.sync.aligned.32x32b.pack::16b.x8.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "[%8];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + 
"=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) + : "r"(src_addr)); + } else if constexpr (N == 16) { + asm volatile( + "tcgen05.ld.sync.aligned.32x32b.pack::16b.x16.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}," + "[%16];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]) + : "r"(src_addr)); + } else if constexpr (N == 32) { + asm volatile( + "tcgen05.ld.sync.aligned.32x32b.pack::16b.x32.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " + "%26, %27, %28, %29, %30, %31}," + "[%32];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) + : "r"(src_addr)); + } else if constexpr (N == 64) { + asm volatile( + "tcgen05.ld.sync.aligned.32x32b.pack::16b.x64.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, 
%62, %63}," + "[%64];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]) + : "r"(src_addr)); + } else if constexpr (N == 128) { + asm volatile( + "tcgen05.ld.sync.aligned.32x32b.pack::16b.x128.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " + "%70, " + "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " + "%84, " + "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " + "%98, " + "%99, %100, %101, %102, %103, %104, %105, 
%106, %107, %108, %109, " + "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}," + "[%128];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), + "=r"(dst_ptr[66]), "=r"(dst_ptr[67]), "=r"(dst_ptr[68]), + "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), + "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), + "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), + "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), + "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), + "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), + "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), + "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), + "=r"(dst_ptr[93]), 
"=r"(dst_ptr[94]), "=r"(dst_ptr[95]), + "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), + "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), + "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), + "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), + "=r"(dst_ptr[108]), "=r"(dst_ptr[109]), "=r"(dst_ptr[110]), + "=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), + "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), + "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), + "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), + "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), "=r"(dst_ptr[125]), + "=r"(dst_ptr[126]), "=r"(dst_ptr[127]) + : "r"(src_addr)); + } else { + asm volatile("trap"); + } + } +}; // 16 data path lanes, 64-bit pattern, repeated N times -class tmem_ld_16dp64bNx { +template class tmem_ld_16dp64bNx; +template <> class tmem_ld_16dp64bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { @@ -352,39 +525,43 @@ public: } } }; - -// 16 data path lanes, 128-bit pattern, repeated N times -class tmem_ld_16dp128bNx { +template <> class tmem_ld_16dp64bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 64, - "N must be a power of 2 and lies between 1 ~ 64"); + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 128, + "N must be a power of 2 and lies between 1 ~ 128"); if constexpr (N == 1) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.x1.b32" + asm volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x1.b32" + "{%0}," + "[%1];\n" + : "=r"(dst_ptr[0]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x2.b32" "{%0, %1}," "[%2];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) : "r"(src_addr)); - } else if constexpr (N == 2) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.x2.b32" + } else if constexpr (N == 4) { + asm 
volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x4.b32" "{%0, %1, %2, %3}," "[%4];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]) : "r"(src_addr)); - } else if constexpr (N == 4) { - asm volatile("tcgen05.ld.sync.aligned.16x128b.x4.b32" + } else if constexpr (N == 8) { + asm volatile("tcgen05.ld.sync.aligned.16x64b.pack::16b.x8.b32" "{%0, %1, %2, %3, %4, %5, %6, %7}," "[%8];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) : "r"(src_addr)); - } else if constexpr (N == 8) { + } else if constexpr (N == 16) { asm volatile( - "tcgen05.ld.sync.aligned.16x128b.x8.b32" + "tcgen05.ld.sync.aligned.16x64b.pack::16b.x16.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15}," "[%16];\n" @@ -395,9 +572,9 @@ public: "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), "=r"(dst_ptr[15]) : "r"(src_addr)); - } else if constexpr (N == 16) { + } else if constexpr (N == 32) { asm volatile( - "tcgen05.ld.sync.aligned.16x128b.x16.b32" + "tcgen05.ld.sync.aligned.16x64b.pack::16b.x32.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " "%26, %27, %28, %29, %30, %31}," @@ -414,9 +591,9 @@ public: "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) : "r"(src_addr)); - } else if constexpr (N == 32) { + } else if constexpr (N == 64) { asm volatile( - "tcgen05.ld.sync.aligned.16x128b.x32.b32" + "tcgen05.ld.sync.aligned.16x64b.pack::16b.x64.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " "%28, " @@ -449,9 +626,9 @@ public: "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), "=r"(dst_ptr[63]) : "r"(src_addr)); - } else if constexpr (N == 64) { + } else if constexpr (N == 128) { asm volatile( - 
"tcgen05.ld.sync.aligned.16x128b.x64.b32" + "tcgen05.ld.sync.aligned.16x64b.pack::16b.x128.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " "%28, " @@ -519,32 +696,39 @@ public: } }; -// 16 data path lanes, 256-bit pattern, repeated N times -class tmem_ld_16dp256bNx { +// 16 data path lanes, 128-bit pattern, repeated N times +template class tmem_ld_16dp128bNx; +template <> class tmem_ld_16dp128bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 32, - "N must be a power of 2 and lies between 1 ~ 32"); + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 64, + "N must be a power of 2 and lies between 1 ~ 64"); if constexpr (N == 1) { - asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32" + asm volatile("tcgen05.ld.sync.aligned.16x128b.x1.b32" + "{%0, %1}," + "[%2];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.x2.b32" "{%0, %1, %2, %3}," "[%4];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]) : "r"(src_addr)); - } else if constexpr (N == 2) { - asm volatile("tcgen05.ld.sync.aligned.16x256b.x2.b32" + } else if constexpr (N == 4) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.x4.b32" "{%0, %1, %2, %3, %4, %5, %6, %7}," "[%8];\n" : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) : "r"(src_addr)); - } else if constexpr (N == 4) { + } else if constexpr (N == 8) { asm volatile( - "tcgen05.ld.sync.aligned.16x256b.x4.b32" + "tcgen05.ld.sync.aligned.16x128b.x8.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15}," "[%16];\n" @@ -555,9 +739,9 @@ public: "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), "=r"(dst_ptr[15]) : "r"(src_addr)); - 
} else if constexpr (N == 8) { + } else if constexpr (N == 16) { asm volatile( - "tcgen05.ld.sync.aligned.16x256b.x8.b32" + "tcgen05.ld.sync.aligned.16x128b.x16.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " "%26, %27, %28, %29, %30, %31}," @@ -574,9 +758,9 @@ public: "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) : "r"(src_addr)); - } else if constexpr (N == 16) { + } else if constexpr (N == 32) { asm volatile( - "tcgen05.ld.sync.aligned.16x256b.x16.b32" + "tcgen05.ld.sync.aligned.16x128b.x32.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " "%28, " @@ -609,9 +793,492 @@ public: "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), "=r"(dst_ptr[63]) : "r"(src_addr)); - } else if constexpr (N == 32) { + } else if constexpr (N == 64) { asm volatile( - "tcgen05.ld.sync.aligned.16x256b.x32.b32" + "tcgen05.ld.sync.aligned.16x128b.x64.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " + "%70, " + "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " + "%84, " + "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " + "%98, " + "%99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, " + "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}," + "[%128];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + 
"=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), + "=r"(dst_ptr[66]), "=r"(dst_ptr[67]), "=r"(dst_ptr[68]), + "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), + "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), + "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), + "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), + "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), + "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), + "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), + "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), + "=r"(dst_ptr[93]), "=r"(dst_ptr[94]), "=r"(dst_ptr[95]), + "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), + "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), + "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), + "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), + "=r"(dst_ptr[108]), "=r"(dst_ptr[109]), 
"=r"(dst_ptr[110]), + "=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), + "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), + "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), + "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), + "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), "=r"(dst_ptr[125]), + "=r"(dst_ptr[126]), "=r"(dst_ptr[127]) + : "r"(src_addr)); + } else { + asm volatile("trap"); + } + } +}; +template <> class tmem_ld_16dp128bNx { +public: + template + static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 64, + "N must be a power of 2 and lies between 1 ~ 64"); + + if constexpr (N == 1) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.pack::16b.x1.b32" + "{%0, %1}," + "[%2];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.pack::16b.x2.b32" + "{%0, %1, %2, %3}," + "[%4];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]) + : "r"(src_addr)); + } else if constexpr (N == 4) { + asm volatile("tcgen05.ld.sync.aligned.16x128b.pack::16b.x4.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "[%8];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) + : "r"(src_addr)); + } else if constexpr (N == 8) { + asm volatile( + "tcgen05.ld.sync.aligned.16x128b.pack::16b.x8.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}," + "[%16];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]) + : "r"(src_addr)); + } else if constexpr (N == 16) { + asm volatile( + 
"tcgen05.ld.sync.aligned.16x128b.pack::16b.x16.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " + "%26, %27, %28, %29, %30, %31}," + "[%32];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) + : "r"(src_addr)); + } else if constexpr (N == 32) { + asm volatile( + "tcgen05.ld.sync.aligned.16x128b.pack::16b.x32.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63}," + "[%64];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + 
"=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]) + : "r"(src_addr)); + } else if constexpr (N == 64) { + asm volatile( + "tcgen05.ld.sync.aligned.16x128b.pack::16b.x64.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " + "%70, " + "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " + "%84, " + "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " + "%98, " + "%99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, " + "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}," + "[%128];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), 
"=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), + "=r"(dst_ptr[66]), "=r"(dst_ptr[67]), "=r"(dst_ptr[68]), + "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), + "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), + "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), + "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), + "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), + "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), + "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), + "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), + "=r"(dst_ptr[93]), "=r"(dst_ptr[94]), "=r"(dst_ptr[95]), + "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), + "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), + "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), + "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), + "=r"(dst_ptr[108]), "=r"(dst_ptr[109]), "=r"(dst_ptr[110]), + "=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), + "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), + "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), + "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), + "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), "=r"(dst_ptr[125]), + 
"=r"(dst_ptr[126]), "=r"(dst_ptr[127]) + : "r"(src_addr)); + } else { + asm volatile("trap"); + } + } +}; + +// 16 data path lanes, 256-bit pattern, repeated N times +template class tmem_ld_16dp256bNx; +template <> class tmem_ld_16dp256bNx { +public: + template + static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 32, + "N must be a power of 2 and lies between 1 ~ 32"); + + if constexpr (N == 1) { + asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32" + "{%0, %1, %2, %3}," + "[%4];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x256b.x2.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "[%8];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) + : "r"(src_addr)); + } else if constexpr (N == 4) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.x4.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}," + "[%16];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]) + : "r"(src_addr)); + } else if constexpr (N == 8) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.x8.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " + "%26, %27, %28, %29, %30, %31}," + "[%32];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + 
"=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) + : "r"(src_addr)); + } else if constexpr (N == 16) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.x16.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63}," + "[%64];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + 
"=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]) + : "r"(src_addr)); + } else if constexpr (N == 32) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.x32.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, " + "%70, " + "%71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %82, %83, " + "%84, " + "%85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, " + "%98, " + "%99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, " + "%110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}," + "[%128];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + 
"=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]), "=r"(dst_ptr[64]), "=r"(dst_ptr[65]), + "=r"(dst_ptr[66]), "=r"(dst_ptr[67]), "=r"(dst_ptr[68]), + "=r"(dst_ptr[69]), "=r"(dst_ptr[70]), "=r"(dst_ptr[71]), + "=r"(dst_ptr[72]), "=r"(dst_ptr[73]), "=r"(dst_ptr[74]), + "=r"(dst_ptr[75]), "=r"(dst_ptr[76]), "=r"(dst_ptr[77]), + "=r"(dst_ptr[78]), "=r"(dst_ptr[79]), "=r"(dst_ptr[80]), + "=r"(dst_ptr[81]), "=r"(dst_ptr[82]), "=r"(dst_ptr[83]), + "=r"(dst_ptr[84]), "=r"(dst_ptr[85]), "=r"(dst_ptr[86]), + "=r"(dst_ptr[87]), "=r"(dst_ptr[88]), "=r"(dst_ptr[89]), + "=r"(dst_ptr[90]), "=r"(dst_ptr[91]), "=r"(dst_ptr[92]), + "=r"(dst_ptr[93]), "=r"(dst_ptr[94]), "=r"(dst_ptr[95]), + "=r"(dst_ptr[96]), "=r"(dst_ptr[97]), "=r"(dst_ptr[98]), + "=r"(dst_ptr[99]), "=r"(dst_ptr[100]), "=r"(dst_ptr[101]), + "=r"(dst_ptr[102]), "=r"(dst_ptr[103]), "=r"(dst_ptr[104]), + "=r"(dst_ptr[105]), "=r"(dst_ptr[106]), "=r"(dst_ptr[107]), + "=r"(dst_ptr[108]), "=r"(dst_ptr[109]), "=r"(dst_ptr[110]), + "=r"(dst_ptr[111]), "=r"(dst_ptr[112]), "=r"(dst_ptr[113]), + "=r"(dst_ptr[114]), "=r"(dst_ptr[115]), "=r"(dst_ptr[116]), + "=r"(dst_ptr[117]), "=r"(dst_ptr[118]), "=r"(dst_ptr[119]), + "=r"(dst_ptr[120]), "=r"(dst_ptr[121]), "=r"(dst_ptr[122]), + "=r"(dst_ptr[123]), "=r"(dst_ptr[124]), "=r"(dst_ptr[125]), + "=r"(dst_ptr[126]), "=r"(dst_ptr[127]) + : "r"(src_addr)); + } else { + asm volatile("trap"); + } + } +}; +template <> class tmem_ld_16dp256bNx { +public: + template + static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { + static_assert(N > 0 && (N & (N - 1)) == 0 && N <= 32, + "N must be a power of 2 and lies between 1 ~ 32"); + + if constexpr (N == 1) { + asm volatile("tcgen05.ld.sync.aligned.16x256b.pack::16b.x1.b32" + "{%0, %1, %2, %3}," + "[%4];\n" + : 
"=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]) + : "r"(src_addr)); + } else if constexpr (N == 2) { + asm volatile("tcgen05.ld.sync.aligned.16x256b.pack::16b.x2.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "[%8];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]) + : "r"(src_addr)); + } else if constexpr (N == 4) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.pack::16b.x4.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}," + "[%16];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]) + : "r"(src_addr)); + } else if constexpr (N == 8) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.pack::16b.x8.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, " + "%26, %27, %28, %29, %30, %31}," + "[%32];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]) + : "r"(src_addr)); + } else if constexpr (N == 16) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.pack::16b.x16.b32" + "{%0, %1, %2, %3, %4, %5, %6, %7, 
%8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " + "%28, " + "%29, %30, %31, %32, %33, %34, %35, %36, %37, %38, %39, %40, %41, " + "%42, " + "%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, " + "%56, " + "%57, %58, %59, %60, %61, %62, %63}," + "[%64];\n" + : "=r"(dst_ptr[0]), "=r"(dst_ptr[1]), "=r"(dst_ptr[2]), + "=r"(dst_ptr[3]), "=r"(dst_ptr[4]), "=r"(dst_ptr[5]), + "=r"(dst_ptr[6]), "=r"(dst_ptr[7]), "=r"(dst_ptr[8]), + "=r"(dst_ptr[9]), "=r"(dst_ptr[10]), "=r"(dst_ptr[11]), + "=r"(dst_ptr[12]), "=r"(dst_ptr[13]), "=r"(dst_ptr[14]), + "=r"(dst_ptr[15]), "=r"(dst_ptr[16]), "=r"(dst_ptr[17]), + "=r"(dst_ptr[18]), "=r"(dst_ptr[19]), "=r"(dst_ptr[20]), + "=r"(dst_ptr[21]), "=r"(dst_ptr[22]), "=r"(dst_ptr[23]), + "=r"(dst_ptr[24]), "=r"(dst_ptr[25]), "=r"(dst_ptr[26]), + "=r"(dst_ptr[27]), "=r"(dst_ptr[28]), "=r"(dst_ptr[29]), + "=r"(dst_ptr[30]), "=r"(dst_ptr[31]), "=r"(dst_ptr[32]), + "=r"(dst_ptr[33]), "=r"(dst_ptr[34]), "=r"(dst_ptr[35]), + "=r"(dst_ptr[36]), "=r"(dst_ptr[37]), "=r"(dst_ptr[38]), + "=r"(dst_ptr[39]), "=r"(dst_ptr[40]), "=r"(dst_ptr[41]), + "=r"(dst_ptr[42]), "=r"(dst_ptr[43]), "=r"(dst_ptr[44]), + "=r"(dst_ptr[45]), "=r"(dst_ptr[46]), "=r"(dst_ptr[47]), + "=r"(dst_ptr[48]), "=r"(dst_ptr[49]), "=r"(dst_ptr[50]), + "=r"(dst_ptr[51]), "=r"(dst_ptr[52]), "=r"(dst_ptr[53]), + "=r"(dst_ptr[54]), "=r"(dst_ptr[55]), "=r"(dst_ptr[56]), + "=r"(dst_ptr[57]), "=r"(dst_ptr[58]), "=r"(dst_ptr[59]), + "=r"(dst_ptr[60]), "=r"(dst_ptr[61]), "=r"(dst_ptr[62]), + "=r"(dst_ptr[63]) + : "r"(src_addr)); + } else if constexpr (N == 32) { + asm volatile( + "tcgen05.ld.sync.aligned.16x256b.pack::16b.x32.b32" "{%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, " "%28, " @@ -681,32 +1348,32 @@ public: // 32 data path lanes, 64-bit pattern, repeated N times // (conducted with 2x16dp64bNx) -class tmem_ld_32dp64bNx { 
+template class tmem_ld_32dp64bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - tmem_ld_16dp64bNx::copy(src_addr, dst_ptr); - tmem_ld_16dp64bNx::copy(src_addr + (16 << 16), dst_ptr + N); + tmem_ld_16dp64bNx::copy(src_addr, dst_ptr); + tmem_ld_16dp64bNx::copy(src_addr + (16 << 16), dst_ptr + N); } }; // 32 data path lanes, 128-bit pattern, repeated N times -class tmem_ld_32dp128bNx { +template class tmem_ld_32dp128bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - tmem_ld_16dp128bNx::copy(src_addr, dst_ptr); - tmem_ld_16dp128bNx::copy(src_addr + (16 << 16), dst_ptr + N * 2); + tmem_ld_16dp128bNx::copy(src_addr, dst_ptr); + tmem_ld_16dp128bNx::copy(src_addr + (16 << 16), dst_ptr + N * 2); } }; // 32 data path lanes, 256-bit pattern, repeated N times -class tmem_ld_32dp256bNx { +template class tmem_ld_32dp256bNx { public: template static TL_DEVICE void copy(uint32_t const &src_addr, uint32_t *dst_ptr) { - tmem_ld_16dp256bNx::copy(src_addr, dst_ptr); - tmem_ld_16dp256bNx::copy(src_addr + (16 << 16), dst_ptr + N * 4); + tmem_ld_16dp256bNx::copy(src_addr, dst_ptr); + tmem_ld_16dp256bNx::copy(src_addr + (16 << 16), dst_ptr + N * 4); } }; diff --git a/tilelang/intrinsics/mma_macro_generator.py b/tilelang/intrinsics/mma_macro_generator.py index aab2a49e..6e49b058 100644 --- a/tilelang/intrinsics/mma_macro_generator.py +++ b/tilelang/intrinsics/mma_macro_generator.py @@ -47,7 +47,10 @@ class TensorCoreIntrinEmitter: "int8": "int8", "int32": "int32", "float8_e4m3": "e4m3", + "float8_e4m3fn": "e4m3", + "float8_e4m3fnuz": "e4m3", "float8_e5m2": "e5m2", + "float8_e5m2fnuz": "e5m2", } # Represent the thread binding in the form of (tx, warp_n, warp_m) diff --git a/tilelang/intrinsics/tcgen05_macro_generator.py b/tilelang/intrinsics/tcgen05_macro_generator.py index e53ff7cb..966f4dc4 100644 --- a/tilelang/intrinsics/tcgen05_macro_generator.py +++ 
b/tilelang/intrinsics/tcgen05_macro_generator.py @@ -169,12 +169,11 @@ class TensorCoreIntrinEmitter(MMAIntrinEmitter): accum_dtype_in_bits = DataType(accum_dtype).bits meta = self.get_tcgen5_mma_meta(m_dim, n_dim, k_dim) - if len(meta) != 3: + if len(meta) != 5: raise ValueError( f"Unsupported TCGEN5MMA configuration for desc generation: M={m_dim}, N={n_dim}, " f"K={k_dim}, A dtype={self.a_dtype}, accum dtype={self.accum_dtype}") - atom_m, atom_n, atom_k = (int(x) for x in meta) - enable_ws = atom_m != 128 + atom_m, atom_n, atom_k, enable_ws, enable_2cta = (int(x) for x in meta) # by default, we utilize non-swizzle layout offset a_leading_byte_offset = (8 * 8 * elems_in_bytes) if a_is_k_major else (8 * m_dim * @@ -382,10 +381,10 @@ class TensorCoreIntrinEmitter(MMAIntrinEmitter): k = int(self.chunk) meta = self.get_tcgen5_mma_meta(m, n, k) - if len(meta) != 3: + if len(meta) != 5: raise ValueError(f"Unsupported TCGEN5MMA configuration: M={m}, N={n}, K={k}, " f"A dtype={self.a_dtype}, accum dtype={self.accum_dtype}") - atom_m, atom_n, _ = (int(x) for x in meta) + atom_m, atom_n, _, _, _ = (int(x) for x in meta) if m % atom_m != 0 or n % atom_n != 0: raise ValueError( diff --git a/tilelang/jit/adapter/wrapper.py b/tilelang/jit/adapter/wrapper.py index 48b8e908..75607976 100644 --- a/tilelang/jit/adapter/wrapper.py +++ b/tilelang/jit/adapter/wrapper.py @@ -144,6 +144,7 @@ class TLCUDASourceWrapper: "float16": "half_t", "bfloat16": "bfloat16_t", "float8_e4m3": "fp8_e4_t", + "float8_e4m3fn": "fp8_e4_t", "float8_e5m2": "fp8_e5_t", "float64": "double", "int64": "int64_t", diff --git a/tilelang/tileop/gemm/gemm_tcgen05.py b/tilelang/tileop/gemm/gemm_tcgen05.py index c2c8c1c8..76f919e0 100644 --- a/tilelang/tileop/gemm/gemm_tcgen05.py +++ b/tilelang/tileop/gemm/gemm_tcgen05.py @@ -85,6 +85,9 @@ class GemmTCGEN5(GemmBase): raise ValueError(f"TCGEN5MMA currently only supports gemm_ss, got " f"A scope {self.A.scope()}, B scope {self.B.scope()}") + atom_m, atom_n, atom_k, 
enable_ws, enable_2cta = mma_emitter.get_tcgen5_mma_meta( + self.M, self.N, self.K) + if self.A.scope() not in {"shared", "shared.dyn", "shared.tmem"}: raise ValueError(f"Unsupported A scope for TCGEN5MMA: {self.A.scope()}") if self.B.scope() not in {"shared", "shared.dyn"}: @@ -105,7 +108,7 @@ class GemmTCGEN5(GemmBase): raise ValueError("TCGEN5MMA expects 2D coordinates for C buffer access") accum_dtype = str(self.C.dtype) - if accum_dtype != "float32": + if accum_dtype not in ["float32", 'float16']: raise ValueError(f"Unsupported accumulator dtype for TCGEN5MMA: {accum_dtype}") A_shared = self.ARegion -- GitLab From 17718bec9d4b10cc7360ee333c560ede675de66a Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Wed, 26 Nov 2025 19:16:29 +0800 Subject: [PATCH 052/139] [Refactor] Enhance CopyNode's IterVar Creation and Range Handling (#1346) * [Refactor] Enhance CopyNode's IterVar Creation and Range Handling This commit refines the `MakeIterVars` method in `CopyNode` to select base ranges based on memory scope levels, ensuring that the chosen ranges are not smaller than the original source ranges. Additionally, it updates the Python `copy` function to clarify range handling, including broadcasting logic and extent alignment. These changes improve the robustness and clarity of the copy operation's implementation. * test fix --- src/op/copy.cc | 88 ++++++++++++++++++++++++++++++++-- tilelang/language/copy.py | 25 +++++++--- tilelang/language/customize.py | 5 +- 3 files changed, 105 insertions(+), 13 deletions(-) diff --git a/src/op/copy.cc b/src/op/copy.cc index b0cac131..1bd548bc 100644 --- a/src/op/copy.cc +++ b/src/op/copy.cc @@ -179,15 +179,95 @@ TileOperator CopyNode::Clone() const { * copy operation. */ Array CopyNode::MakeIterVars() const { + // Choose the range set from the lowest-level memory scope between src and + // dst. 
Scope levels: global < shared/shared.dyn/shared.tmem < local.fragment + // (fragment) + auto scope_level = [](const Buffer &b) -> int { + String s = b.scope(); + if (s == "local.fragment" || s == "local") + return 2; + if (s == "shared" || s == "shared.dyn" || s == "shared.tmem") + return 1; + // default to global level for unknown scopes + return 0; + }; + + int src_level = scope_level(src); + int dst_level = scope_level(dst); + bool base_is_src = (src_level >= dst_level); + const Array &base_ranges = base_is_src ? src_range : dst_range; + + // Sanity check: when switching away from the original (src_range), + // ensure the chosen base ranges are not provably smaller than the original + // per dimension. This guards against generating undersized loop domains. + // Improved logic: use two pointers to traverse both base_ranges and + // src_range, skipping dimensions with extent == 1. The number of non-1 + // extents must match. + arith::Analyzer analyzer; + + size_t base_dim = 0, src_dim = 0; + while (base_dim < base_ranges.size() && src_dim < src_range.size()) { + // Skip base extents that are 1 + while (base_dim < base_ranges.size() && + is_one(base_ranges[base_dim]->extent)) { + ++base_dim; + } + // Skip src extents that are 1 + while (src_dim < src_range.size() && is_one(src_range[src_dim]->extent)) { + ++src_dim; + } + // Both indices now at non-1, or at end + if (base_dim < base_ranges.size() && src_dim < src_range.size()) { + PrimExpr base_ext = base_ranges[base_dim]->extent; + PrimExpr src_ext = src_range[src_dim]->extent; + // Only fail if base extent is provably smaller than src extent + if (analyzer.CanProve(base_ext < src_ext)) { + std::ostringstream oss; + oss << "Selected loop range is smaller than original src range at " + "matched non-1 dimension: " + << "base(extent=" << base_ext + << ", scope=" << (base_is_src ? 
src.scope() : dst.scope()) + << ", min=" << base_ranges[base_dim]->min + << ", base_dim=" << base_dim << ") < src(extent=" << src_ext + << ", min=" << src_range[src_dim]->min << ", src_dim=" << src_dim + << ", scope=" << src.scope() << ") for src=" << src->name + << ", dst=" << dst->name << "\n"; + oss << "src buffer: " << src->name << ", scope=" << src.scope() << "\n"; + oss << "dst buffer: " << dst->name << ", scope=" << dst.scope() << "\n"; + oss << "base_ranges[" << base_dim + << "]: min=" << base_ranges[base_dim]->min + << ", extent=" << base_ext << "\n"; + oss << "src_ranges[" << src_dim << "]: min=" << src_range[src_dim]->min + << ", extent=" << src_ext << "\n"; + LOG(FATAL) << oss.str(); + } + ++base_dim; + ++src_dim; + } + } + + // Any remaining unmatched dimensions in either range must all have extent == + // 1 + while (base_dim < base_ranges.size()) { + ICHECK(is_one(base_ranges[base_dim]->extent)) + << "base_ranges has extra non-1 extent at dim " << base_dim; + ++base_dim; + } + while (src_dim < src_range.size()) { + ICHECK(is_one(src_range[src_dim]->extent)) + << "src_range has extra non-1 extent at dim " << src_dim; + ++src_dim; + } + Array loop_vars; size_t idx = 0; - for (size_t i = 0; i < src_range.size(); i++) { - if (is_one(src_range[i]->extent)) + for (size_t i = 0; i < base_ranges.size(); i++) { + if (is_one(base_ranges[i]->extent)) continue; - Var var = Var(std::string{char('i' + idx)}, src_range[i]->extent->dtype); + Var var = Var(std::string{char('i' + idx)}, base_ranges[i]->extent->dtype); idx++; loop_vars.push_back( - {Range(0, src_range[i]->extent), var, IterVarType::kDataPar}); + {Range(0, base_ranges[i]->extent), var, IterVarType::kDataPar}); } return loop_vars; } diff --git a/tilelang/language/copy.py b/tilelang/language/copy.py index d59d73e8..965919fd 100644 --- a/tilelang/language/copy.py +++ b/tilelang/language/copy.py @@ -27,6 +27,22 @@ def copy(src: tir.Buffer | tir.BufferLoad | tir.BufferRegion, Returns: tir.Call: A handle to 
the copy operation + + Range handling notes: + - Accepts `Buffer`/`BufferRegion`/`BufferLoad` on either side. Extents are + derived as follows: `Buffer -> shape`, `BufferRegion -> [r.extent]`, + `BufferLoad -> extents from its inferred/encoded region`. + - If both `src` and `dst` are scalar `BufferLoad` without region extents, + lowers to a direct store: `dst[...] = src`. + - If one side is missing extents, it is treated as all-ones with the other + side's rank to enable broadcasting. + - Extents are right-aligned and legalized via `legalize_pairwise_extents`: + per tail-dimension, equal keeps as-is, a `1` broadcasts to the other, + otherwise a conservative `tir.max` is used to remain safe for dynamic + shapes. + - The finalized extents are encoded with `tl.region` via `to_buffer_region` + and passed through to the backend; low-level loop construction and any + scope-specific decisions happen during lowering. """ if isinstance(src, tir.Buffer) and isinstance(dst, tir.Buffer): ir.assert_structural_equal(src.shape, dst.shape) @@ -57,16 +73,11 @@ def copy(src: tir.Buffer | tir.BufferLoad | tir.BufferRegion, return tir.BufferStore(dst.buffer, src, dst.indices) assert src_extent or dst_extent, "Can't deduce copy extents from args" - # Treat missing extent as length-matched ones to enable broadcasting logic. + # Treat missing extent as length-matched ones to enable broadcasting. src_extent = list(src_extent) if src_extent else [1] * len(dst_extent) dst_extent = list(dst_extent) if dst_extent else [1] * len(src_extent) - # Align and broadcast extents from the right (tail) side independently - # for src and dst, so we can pass them unchanged into _to_region. - # Rules per-dim from the right: - # - equal -> keep both - # - one is 1 -> set that side to the other side's dim - # - otherwise -> error + # Align and broadcast extents from the right (tail) side. 
src_extent, dst_extent = legalize_pairwise_extents(src_extent, dst_extent) # Use legalized extents for src and dst respectively. diff --git a/tilelang/language/customize.py b/tilelang/language/customize.py index 3d40ce47..720c9e99 100644 --- a/tilelang/language/customize.py +++ b/tilelang/language/customize.py @@ -46,8 +46,9 @@ def reshape(src: Buffer, shape: list[PrimExpr]) -> Buffer: Returns: Buffer: A new buffer view with the specified shape """ - assert prim_expr_equal(bits_product(shape, src.dtype), - bits_product(src.shape, src.dtype)), "T.reshape/view shape check failed." + assert prim_expr_equal( + bits_product(shape, src.dtype), bits_product(src.shape, src.dtype) + ), f"T.reshape/view shape check failed. src {src} src.shape: {src.shape}, src.dtype: {src.dtype}, target shape: {shape}, target dtype: {src.dtype}" return T.Tensor(shape, src.dtype, src.data) -- GitLab From 4f844000e3d36b9ff2c7bc4f44bbcea8c92bd152 Mon Sep 17 00:00:00 2001 From: Kuris <227995639+kurisu6912@users.noreply.github.com> Date: Wed, 26 Nov 2025 19:27:43 +0800 Subject: [PATCH 053/139] [Fix] Fix missing `not` rewrite in frontend (#1348) --- .../language/test_tilelang_language_frontend_v2.py | 13 +++++++++++++ tilelang/language/v2/ast.py | 12 ++++++++++-- tilelang/language/v2/builder.py | 9 +++++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/testing/python/language/test_tilelang_language_frontend_v2.py b/testing/python/language/test_tilelang_language_frontend_v2.py index 299a4127..ee694104 100644 --- a/testing/python/language/test_tilelang_language_frontend_v2.py +++ b/testing/python/language/test_tilelang_language_frontend_v2.py @@ -466,5 +466,18 @@ def test_buffer_slice_step(): pass +def test_boolop(): + a = Var('a', 'int32') + b = Var('b', 'int32') + c = Var('c', 'int32') + d = Var('d', 'int32') + + @T.macro + def cond(): + return not (a < b and b < c and a * d < b * d) or b * d < c * d + + cond() + + if __name__ == '__main__': tilelang.testing.main() diff --git 
a/tilelang/language/v2/ast.py b/tilelang/language/v2/ast.py index 307efdac..c6dfecf1 100644 --- a/tilelang/language/v2/ast.py +++ b/tilelang/language/v2/ast.py @@ -78,7 +78,7 @@ def quote_expr(expr: str, **kws) -> ast.expr: Operator = Literal['Add', 'Sub', 'Mult', 'MatMult', 'Div', 'Mod', 'Pow', 'LShift', 'RShift', 'BitOr', 'BitXor', 'BitAnd', 'FloorDiv'] -BoolOp = Literal['And', 'Or'] +BoolOp = Literal['And', 'Or', 'Not'] def get_operator_name(operator: ast.operator) -> Operator: @@ -217,11 +217,13 @@ class BaseBuilder: def aug_assign_slice(self, op: Operator, target: Any, sl: slice, aug_value: Any): eval_aug_assign(op, target, sl, aug_value) - def boolop(self, op: BoolOp, left: Any, right: Callable[[], Any]) -> Any: + def boolop(self, op: BoolOp, left: Any, right: Callable[[], Any] | None = None) -> Any: if op == 'And': return left and right() if op == 'Or': return left or right() + if op == 'Not': + return not left raise ValueError(f'Unknown boolop: {op}') def ifexp(self, cond: Any, then: Callable[[], Any], otherwise: Callable[[], Any]) -> Any: @@ -517,6 +519,12 @@ class DSLMutator(ast.NodeTransformer): ) return last + def visit_UnaryOp(self, node: ast.UnaryOp): + node = self.generic_visit(node) + if isinstance(node.op, ast.Not): + return quote_expr("__tb.boolop('Not', operand)", operand=node.operand, span=node) + return node + def visit_Compare(self, node: ast.Compare) -> ast.expr: node = self.generic_visit(node) left = node.left diff --git a/tilelang/language/v2/builder.py b/tilelang/language/v2/builder.py index c54b0701..aea425ad 100644 --- a/tilelang/language/v2/builder.py +++ b/tilelang/language/v2/builder.py @@ -148,8 +148,7 @@ class Builder(BaseBuilder): @classmethod def current(cls) -> Self: - builder = thread_local_storage.builder - assert builder is not None, "No active Builder found in the current thread." 
+ builder = getattr(thread_local_storage, 'builder', None) return builder @contextmanager @@ -424,7 +423,7 @@ class Builder(BaseBuilder): else: return super().aug_assign_slice(op, target, sl, aug_value) - def boolop(self, op, left, right): + def boolop(self, op, left, right=None): left = unwrap_cond(left) if isinstance(left, PrimExpr): with self.with_frame(BoolOpFrame()): @@ -432,6 +431,8 @@ class Builder(BaseBuilder): return tir.And(left, right()) if op == 'Or': return tir.Or(left, right()) + if op == 'Not': + return tir.Not(left) raise RuntimeError(f"Unsupported boolean operator: {op}") else: return super().boolop(op, left, right) @@ -562,7 +563,7 @@ class Macro(Generic[_P, _T]): return self.ir_gen.source def __call__(self, *args: _P.args, **kwargs: _P.kwargs) -> _T: - builder = Builder.current() + builder = Builder.current() or Builder() with builder.macro(self.name, self.annotations): res = self.ir_gen.gen(builder)(*args, **kwargs) return res -- GitLab From 6bae64f6ebf5737bb8648b81584cd1b644e003d2 Mon Sep 17 00:00:00 2001 From: Gongen-Ali Date: Wed, 26 Nov 2025 19:48:57 +0800 Subject: [PATCH 054/139] [Enhancement] Add support for k_pack in gemm_mfma (#1344) * add support for k_pack * support benchmark on ROCm * fix format --- benchmark/matmul_fp8/benchmark_matmul.py | 6 +++- src/tl_templates/hip/hip_fp8.h | 38 +++++++++++++++++++++ tilelang/intrinsics/mfma_macro_generator.py | 9 ++--- tilelang/tileop/gemm/gemm_mfma.py | 18 +++++----- 4 files changed, 58 insertions(+), 13 deletions(-) diff --git a/benchmark/matmul_fp8/benchmark_matmul.py b/benchmark/matmul_fp8/benchmark_matmul.py index 36b91035..796f7b90 100644 --- a/benchmark/matmul_fp8/benchmark_matmul.py +++ b/benchmark/matmul_fp8/benchmark_matmul.py @@ -1,5 +1,6 @@ import argparse import itertools +import torch import logging import tilelang import tilelang.language as T @@ -99,6 +100,7 @@ def get_configs(args, kwargs): block_K=[64, 128], num_stages=[0, 1, 2, 3], thread_num=[128, 256], + k_pack=[1, 2], 
policy=[T.GemmWarpPolicy.Square], enable_rasteration=[True, False], ) @@ -125,6 +127,7 @@ def matmul( block_K=None, num_stages=None, thread_num=None, + k_pack=None, policy=None, enable_rasteration=None, ): @@ -156,7 +159,7 @@ def matmul( # Use half-precision for input data to reduce memory bandwidth, # accumulate in float for better numerical accuracy - dtype = "float8_e4m3" + dtype = "float8_e4m3fnuz" if torch.version.hip is not None else "float8_e4m3" accum_dtype = "float" @T.prim_func @@ -210,6 +213,7 @@ def matmul( C_local, transpose_B=True, policy=policy, + k_pack=k_pack, ) # Write back the results from C_local to the global memory C T.copy(C_local, C_shared) diff --git a/src/tl_templates/hip/hip_fp8.h b/src/tl_templates/hip/hip_fp8.h index 0000745b..b32f84dc 100644 --- a/src/tl_templates/hip/hip_fp8.h +++ b/src/tl_templates/hip/hip_fp8.h @@ -127,3 +127,41 @@ __device__ fp8_e4_8_t make_fp8_e4_8_t(fp8_e4_t x, fp8_e4_t y, fp8_e4_t z, res.y = *reinterpret_cast(&b); return res; } + +__device__ fp8_e4_16_t make_fp8_e4_16_t(fp8_e4_t x0, fp8_e4_t x1, fp8_e4_t x2, + fp8_e4_t x3, fp8_e4_t x4, fp8_e4_t x5, + fp8_e4_t x6, fp8_e4_t x7, fp8_e4_t y0, + fp8_e4_t y1, fp8_e4_t y2, fp8_e4_t y3, + fp8_e4_t y4, fp8_e4_t y5, fp8_e4_t y6, + fp8_e4_t y7) { + signed char x0_char = *reinterpret_cast(&x0); + signed char x1_char = *reinterpret_cast(&x1); + signed char x2_char = *reinterpret_cast(&x2); + signed char x3_char = *reinterpret_cast(&x3); + signed char x4_char = *reinterpret_cast(&x4); + signed char x5_char = *reinterpret_cast(&x5); + signed char x6_char = *reinterpret_cast(&x6); + signed char x7_char = *reinterpret_cast(&x7); + signed char y0_char = *reinterpret_cast(&y0); + signed char y1_char = *reinterpret_cast(&y1); + signed char y2_char = *reinterpret_cast(&y2); + signed char y3_char = *reinterpret_cast(&y3); + signed char y4_char = *reinterpret_cast(&y4); + signed char y5_char = *reinterpret_cast(&y5); + signed char y6_char = *reinterpret_cast(&y6); + signed char 
y7_char = *reinterpret_cast(&y7); + int a = (x3_char << 24) | (x2_char << 16) | (x1_char << 8) | x0_char; + int b = (x7_char << 24) | (x6_char << 16) | (x5_char << 8) | x4_char; + int c = (y3_char << 24) | (y2_char << 16) | (y1_char << 8) | y0_char; + int d = (y7_char << 24) | (y6_char << 16) | (y5_char << 8) | y4_char; + fp8_e4_8_t res_x; + res_x.x = *reinterpret_cast(&a); + res_x.y = *reinterpret_cast(&b); + fp8_e4_8_t res_y; + res_y.x = *reinterpret_cast(&c); + res_y.y = *reinterpret_cast(&d); + fp8_e4_16_t res; + res.x = res_x; + res.y = res_y; + return res; +} \ No newline at end of file diff --git a/tilelang/intrinsics/mfma_macro_generator.py b/tilelang/intrinsics/mfma_macro_generator.py index 02c0b039..618a9981 100644 --- a/tilelang/intrinsics/mfma_macro_generator.py +++ b/tilelang/intrinsics/mfma_macro_generator.py @@ -372,8 +372,8 @@ class MatrixCoreIntrinEmitter: a_is_fragment = is_fragment(A_local_buf) b_is_fragment = is_fragment(B_local_buf) - a_local_stride: PrimExpr = k_inner * warp_rows * local_size_a if a_is_fragment else 0 - b_local_stride: PrimExpr = k_inner * warp_cols * local_size_b if b_is_fragment else 0 + a_local_stride: PrimExpr = k_inner * warp_rows * k_pack * local_size_a if a_is_fragment else 0 + b_local_stride: PrimExpr = k_inner * warp_cols * k_pack * local_size_b if b_is_fragment else 0 @T.macro def _warp_mfma(A_local_buf, B_local_buf, C_local_buf): @@ -543,7 +543,8 @@ class MatrixCoreIntrinEmitter: return local_id base_fragment = T.Fragment( - [micro_size_s, micro_size_r] if is_sr_axis_order else [micro_size_r, micro_size_s], + [micro_size_s, micro_size_r * + self.k_pack] if is_sr_axis_order else [micro_size_r * self.k_pack, micro_size_s], forward_thread_fn=forward_thread, forward_index_fn=forward_index, ) @@ -552,7 +553,7 @@ class MatrixCoreIntrinEmitter: chunk = self.chunk warp_s = warp_rows if matrix_is_a else warp_cols - warp_r = chunk // micro_size_r + warp_r = chunk // (micro_size_r * self.k_pack) block_s = block_row_warps if 
matrix_is_a else block_col_warps replicate = block_col_warps if matrix_is_a else block_row_warps diff --git a/tilelang/tileop/gemm/gemm_mfma.py b/tilelang/tileop/gemm/gemm_mfma.py index 45a53d3c..862ec725 100644 --- a/tilelang/tileop/gemm/gemm_mfma.py +++ b/tilelang/tileop/gemm/gemm_mfma.py @@ -28,6 +28,7 @@ class GemmMFMA(GemmBase): warp_row_tiles=warp_row_tiles, warp_col_tiles=warp_col_tiles, chunk=self.chunk, + k_pack=self.k_pack, ) if self.is_gemm_ss(): @@ -75,6 +76,7 @@ class GemmMFMA(GemmBase): warp_col_tiles=warp_col_tiles, chunk=self.chunk, thread_var=thread_var, + k_pack=self.k_pack, ) in_dtype = self.in_dtype @@ -110,11 +112,11 @@ class GemmMFMA(GemmBase): B_shared into local fragments, then issues Matrix Core mfma ops, accumulating into C_local. """ - A_local = T.alloc_local((warp_rows * local_size_a), in_dtype) - B_local = T.alloc_local((warp_cols * local_size_b), in_dtype) + A_local = T.alloc_local((warp_rows * local_size_a * self.k_pack), in_dtype) + B_local = T.alloc_local((warp_cols * local_size_b * self.k_pack), in_dtype) if clear_accum: T.clear(C_buf) - for ki in T.serial(0, (block_K // micro_size_k)): + for ki in T.serial(0, (block_K // (micro_size_k * self.k_pack))): # Load A into fragment mfma_emitter.ldmatrix_a( A_local, @@ -145,12 +147,12 @@ class GemmMFMA(GemmBase): B_shared into local fragments, then issues Matrix Core mfma ops, accumulating into C_local. """ - A_local = T.alloc_local((warp_rows * local_size_a), in_dtype) + A_local = T.alloc_local((warp_rows * local_size_a * self.k_pack), in_dtype) if clear_accum: T.clear(C_buf) - for ki in T.serial(0, (block_K // micro_size_k)): + for ki in T.serial(0, (block_K // (micro_size_k * self.k_pack))): # Load A into fragment mfma_emitter.ldmatrix_a( @@ -177,10 +179,10 @@ class GemmMFMA(GemmBase): B_shared into local fragments, then issues Matrix Core mfma ops, accumulating into C_local. 
""" - B_local = T.alloc_local((warp_cols * local_size_b), in_dtype) + B_local = T.alloc_local((warp_cols * local_size_b * self.k_pack), in_dtype) if clear_accum: T.clear(C_buf) - for ki in T.serial(0, (block_K // micro_size_k)): + for ki in T.serial(0, (block_K // (micro_size_k * self.k_pack))): # Load B into fragment mfma_emitter.ldmatrix_b( @@ -207,7 +209,7 @@ class GemmMFMA(GemmBase): accumulating into C_local. """ - for ki in T.serial(0, (block_K // micro_size_k)): + for ki in T.serial(0, (block_K // (micro_size_k * self.k_pack))): # Perform Matrix Multiplication mfma_emitter.mfma(A_buf, B_buf, C_buf, ki) -- GitLab From b8240b7ae9387ba7143e6243b59069c3a04a12e9 Mon Sep 17 00:00:00 2001 From: Yuxuan Hu Date: Thu, 27 Nov 2025 14:28:14 +0800 Subject: [PATCH 055/139] Add sparse fine-tuning kernel for deepseek sparse attention to example (#1296) * [EXAMPLE] add example for dsa sparse finetuning * [Refactor] --- examples/dsa_sparse_finetune/dsa.py | 252 +++++++++++ examples/dsa_sparse_finetune/index.py | 79 ++++ examples/dsa_sparse_finetune/indexer_bwd.py | 265 +++++++++++ .../indexer_topk_reducesum.py | 277 ++++++++++++ .../dsa_sparse_finetune/sparse_mla_bwd.py | 420 ++++++++++++++++++ .../dsa_sparse_finetune/sparse_mla_fwd.py | 332 ++++++++++++++ .../sparse_mla_topk_reducesum.py | 241 ++++++++++ examples/dsa_sparse_finetune/utils.py | 75 ++++ 8 files changed, 1941 insertions(+) create mode 100644 examples/dsa_sparse_finetune/dsa.py create mode 100644 examples/dsa_sparse_finetune/index.py create mode 100644 examples/dsa_sparse_finetune/indexer_bwd.py create mode 100644 examples/dsa_sparse_finetune/indexer_topk_reducesum.py create mode 100644 examples/dsa_sparse_finetune/sparse_mla_bwd.py create mode 100644 examples/dsa_sparse_finetune/sparse_mla_fwd.py create mode 100644 examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py create mode 100644 examples/dsa_sparse_finetune/utils.py diff --git a/examples/dsa_sparse_finetune/dsa.py 
b/examples/dsa_sparse_finetune/dsa.py new file mode 100644 index 00000000..1ca28241 --- /dev/null +++ b/examples/dsa_sparse_finetune/dsa.py @@ -0,0 +1,252 @@ +from typing import Optional +import torch +import torch.nn.functional as F +from indexer_topk_reducesum import indexer_topk_reducesum_interface +from indexer_bwd import indexer_bwd_interface +from sparse_mla_fwd import sparse_mla_fwd_interface +from sparse_mla_bwd import sparse_mla_bwd +from sparse_mla_topk_reducesum import sparse_mla_topk_reducesum_interface +from einops import einsum, repeat +from utils import get_abs_err, get_err_ratio + + +class RegsiterLossFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, loss): + ctx.save_for_backward(loss) + return x + + @staticmethod + def backward(ctx, grad): + loss = ctx.saved_tensors + return grad, torch.ones(1, dtype=loss[0].dtype, device=loss[0].device) + + +register_loss = RegsiterLossFunction.apply + + +def ref_deepseek_sparse_attention_innner( + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + index_sm_scale: Optional[float] = None, +): + dtype = q.dtype + q, kv, index_q, index_k, weights = map(lambda x: x.to(torch.float32), + (q, kv, index_q, index_k, weights)) + + index_sm_scale = index_q.shape[-1]**-0.5 + b, s = index_q.shape[:2] + + # tl_topk_indices = tl_topk_indices.to(torch.int64) + # tl_topk_indices[tl_topk_indices == -1] = s + + casual_mask = (torch.arange(s)[:, None] >= torch.arange(s)[None, :]).to(q.device) + index_logits = einsum(index_q, index_k, 'b s1 h k, b s2 k -> b s1 h s2') + index_logits = F.relu(index_logits) + index_logits = (index_logits * weights.unsqueeze(-1)).sum( + dim=-2, dtype=torch.float32) * index_sm_scale + index_logits = torch.where(casual_mask, index_logits, float('-inf')) + topk_indices = torch.topk(index_logits, k=topk, dim=-1).indices + topk_logits = torch.gather( + 
F.pad(index_logits, (0, 1), value=float('-inf')), dim=-1, index=topk_indices) + topk_score = F.log_softmax(topk_logits, dim=-1, dtype=torch.float32) + index_topk_score = topk_score + + if sm_scale is None: + sm_scale = kv.shape[-1]**-0.5 + + h = q.shape[-2] + index_mask = torch.zeros((b, s, s + 1), dtype=torch.bool, device="cuda")\ + .scatter_(dim=-1, index=topk_indices, src=torch.ones_like(topk_indices, dtype=torch.bool))[:, :, :-1] + mask = repeat(casual_mask & index_mask, 'b s1 s2 -> b s1 h s2', h=h) + k, v = kv, kv[..., :dim_v] + logits = einsum(q, k, 'b s1 h d, b s2 d -> b s1 h s2') * sm_scale + logits = torch.where(mask, logits, float('-inf')) + attn_score = F.softmax(logits, dim=-1, dtype=torch.float32) + o = einsum(attn_score, v, 'b s1 h s2, b s2 d -> b s1 h d') + + attn_score = attn_score.sum(dim=-2) # [b, s1, s2] + attn_topk_score = torch.gather(F.pad(attn_score, (0, 1)), dim=-1, index=topk_indices) + attn_topk_score = attn_topk_score / attn_topk_score.sum(dim=-1, keepdim=True) + + loss = F.kl_div( + index_topk_score.clip(-100, 0), + attn_topk_score.detach().log().clip(-100, 0), + log_target=True, + reduction="sum") + o = register_loss(o, loss) + + return o.to(dtype), topk_indices + + +def ref_deepseek_sparse_attention( + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + index_sm_scale: Optional[float] = None, +): + all_o, all_topk_indices = [], [] + for i in range(offsets.shape[0] - 1): + o, topk_indices = ref_deepseek_sparse_attention_innner( + q[None, offsets[i]:offsets[i + 1]], + kv[None, offsets[i]:offsets[i + 1]], + index_q[None, offsets[i]:offsets[i + 1]], + index_k[None, offsets[i]:offsets[i + 1]], + weights[None, offsets[i]:offsets[i + 1]], + topk, + dim_v, + sm_scale, + index_sm_scale, + ) + all_o.append(o.squeeze(0)) + all_topk_indices.append(topk_indices.squeeze(0)) + o = torch.cat(all_o, 
dim=0) + topk_indices = torch.cat(all_topk_indices, dim=0) + return o, topk_indices + + +class DSAFunction(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + ): + # topk_indices, index_score = ref_index_score(index_q, weights, index_k, topk) + topk_indices, index_score = indexer_topk_reducesum_interface(index_q, weights, index_k, + topk, offsets) + o, lse = sparse_mla_fwd_interface( + q, kv.unsqueeze(-2), topk_indices.unsqueeze(-2), offsets, sm_scale=sm_scale, d_v=dim_v) + ctx.save_for_backward(q, kv, index_q, index_k, weights, topk_indices, index_score, o, lse, + offsets) + ctx.topk = topk + ctx.dim_v = dim_v + ctx.sm_scale = sm_scale + return o, topk_indices + + @staticmethod + def backward( + ctx, + do: torch.Tensor, + _1: torch.Tensor, + ): + q, kv, index_q, index_k, weights, topk_indices, index_score, o, lse, offsets = ctx.saved_tensors + attn_score = sparse_mla_topk_reducesum_interface( + q, kv.unsqueeze(-2), topk_indices.unsqueeze(-2), lse, offsets, + dim_v=ctx.dim_v).squeeze(-2) + dq, dkv = sparse_mla_bwd( + q, + kv.unsqueeze(-2), + o, + do, + topk_indices.unsqueeze(-2), + lse, + offsets, + sm_scale=ctx.sm_scale) + dindex_q, dweights, dindex_k = indexer_bwd_interface(index_q, weights, index_k, attn_score, + index_score, topk_indices, offsets) + return dq, dkv.squeeze(-2), dindex_q, dindex_k, dweights, None, None, None, None + + +def deepseek_sparse_attention( + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, +): + return DSAFunction.apply(q, kv, index_q, index_k, weights, offsets, topk, dim_v, sm_scale) + + +def test_kernel( + B=1, + S=2048, + H=16, + D=512, + tail_D=64, + index_D=128, + 
topk=64, +): + torch.manual_seed(42) + q = torch.randn((S, H, D + tail_D)).cuda().bfloat16().requires_grad_() + kv = torch.randn((S, D + tail_D)).cuda().bfloat16().requires_grad_() + index_q = torch.randn((S, H, index_D)).cuda().bfloat16().requires_grad_() + weights = torch.randn((S, H)).cuda().bfloat16().requires_grad_() + index_k = torch.randn((S, index_D)).cuda().bfloat16().requires_grad_() + do = torch.randn((S, H, D)).cuda().bfloat16().requires_grad_() + offsets = torch.tensor([0, S // 2, S], dtype=torch.int32).cuda() + + o, topk_indices = deepseek_sparse_attention(q, kv, index_q, index_k, weights, offsets, topk, D) + o.backward(do) + q_grad, q.grad = q.grad, None + kv_grad, kv.grad = kv.grad, None + index_q_grad, index_q.grad = index_q.grad, None + index_k_grad, index_k.grad = index_k.grad, None + weights_grad, weights.grad = weights.grad, None + + ref_o, ref_topk_indices = ref_deepseek_sparse_attention(q, kv, index_q, index_k, weights, + offsets, topk, D) + ref_o.backward(do) + ref_q_grad, q.grad = q.grad, None + ref_kv_grad, kv.grad = kv.grad, None + ref_index_q_grad, index_q.grad = index_q.grad, None + ref_index_k_grad, index_k.grad = index_k.grad, None + ref_weights_grad, weights.grad = weights.grad, None + + print(f"o err: {get_abs_err(o, ref_o):.6f} ratio: {get_err_ratio(o, ref_o):.6f}") + print( + f"q.grad err: {get_abs_err(q_grad, ref_q_grad):.6f} ratio: {get_err_ratio(q_grad, ref_q_grad):.6f}" + ) + print( + f"kv.grad err: {get_abs_err(kv_grad, ref_kv_grad):.6f} ratio: {get_err_ratio(kv_grad, ref_kv_grad):.6f}" + ) + print( + f"index_q.grad err: {get_abs_err(index_q_grad[:, :64, :], ref_index_q_grad[:, :64, :]):.6f} ratio: {get_err_ratio(index_q_grad[:, :64, :], ref_index_q_grad[:, :64, :]):.6f}" + ) + print( + f"index_k.grad err: {get_abs_err(index_k_grad, ref_index_k_grad):.6f} ratio: {get_err_ratio(index_k_grad, ref_index_k_grad):.6f}" + ) + print( + f"weights.grad err: {get_abs_err(weights_grad, ref_weights_grad):.6f} ratio: 
{get_err_ratio(weights_grad, ref_weights_grad):.6f}" + ) + + intersections = [] + for j in range(S): + ref_np = ref_topk_indices[j].cpu().to(torch.int32).numpy() + trt_np = topk_indices[j].cpu().to(torch.int32).numpy() + + mask = (trt_np != -1) + + set_ref = set(ref_np[mask]) + set_trt = set(trt_np[mask]) + intersection = set_ref & set_trt + intersections.append(len(intersection) / len(set_ref)) + print("average intersections: {:.4f}".format(sum(intersections) / len(intersections))) + + +test_kernel() diff --git a/examples/dsa_sparse_finetune/index.py b/examples/dsa_sparse_finetune/index.py new file mode 100644 index 00000000..92ce687f --- /dev/null +++ b/examples/dsa_sparse_finetune/index.py @@ -0,0 +1,79 @@ +# Modified from: https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/index.py +import torch +import torch.nn.functional as F +import functools +from typing import Callable, Any + + +def tensor_cache(fn: Callable[..., torch.Tensor],) -> Callable[..., torch.Tensor]: + """ + A decorator that caches the most recent result of a function with tensor inputs. + + This decorator will store the output of the decorated function for the most recent set of input tensors. + If the function is called again with the same input tensors, it will return the cached result. + + + Args: + fn (Callable[..., torch.Tensor]): + The function to be decorated. It should take tensor inputs and return tensor outputs. + + Returns: + Callable[..., torch.Tensor]: + A wrapped version of the input function with single-entry caching. 
+ """ + last_args: tuple | None = None + last_kwargs: dict | None = None + last_result: Any = None + + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + nonlocal last_args, last_kwargs, last_result + + if (last_args is not None and last_kwargs is not None) and \ + (len(args) == len(last_args) and len(kwargs) == len(last_kwargs)) and \ + all(a is b for a, b in zip(args, last_args, strict=False)) and \ + all(k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()): + return last_result + + result = fn(*args, **kwargs) + last_args, last_kwargs, last_result = args, kwargs, result + return result + + return wrapper + + +@tensor_cache +def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_cu_seqlens_from_lens( + lens: torch.LongTensor, + dtype: torch.dtype | None = torch.int32, +) -> torch.LongTensor: + return F.pad(lens.cumsum(dim=0, dtype=dtype), (1, 0)) + + +@tensor_cache +def prepare_lens_from_cu_seqlens(cu_seqlens: torch.LongTensor,) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_position_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.cat([ + torch.arange(n, dtype=cu_seqlens.dtype, device=cu_seqlens.device) + for n in prepare_lens(cu_seqlens).unbind() + ]) + + +@tensor_cache +def prepare_sequence_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return prepare_position_ids(cu_seqlens).eq(0).cumsum(0) - 1 + + +@tensor_cache +def prepare_token_indices(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + position_ids = prepare_position_ids(cu_seqlens) + return torch.stack([prepare_sequence_ids(cu_seqlens), position_ids], 1).to(cu_seqlens) diff --git a/examples/dsa_sparse_finetune/indexer_bwd.py b/examples/dsa_sparse_finetune/indexer_bwd.py new file mode 100644 index 00000000..5430c1c0 --- /dev/null +++ b/examples/dsa_sparse_finetune/indexer_bwd.py @@ -0,0 +1,265 @@ +import torch +import 
torch.nn.functional as F +from einops import einsum, repeat + +import tilelang as tl +import tilelang.language as T +from typing import Optional +from index import prepare_token_indices + +from utils import get_abs_err, get_err_ratio + +BF16 = "bfloat16" +FP32 = "float32" +INT32 = "int32" + +pass_configs = { + tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, +} + + +@tl.jit(pass_configs=pass_configs) +def tl_indexer_bwd_impl( + heads: int, + dim: int, + topk: int, + sm_scale: Optional[float] = None, + block_I: int = 32, + num_stages: int = 0, + num_threads: int = 128, +): + assert num_stages == 0 + assert topk == tl.math.next_power_of_2(topk) + assert topk % block_I == 0 + assert heads <= 64 and heads % 8 == 0 + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + dtype: str = BF16 + accum_dtype: str = FP32 + index_q_shape = [seq_len, heads, dim] + weights_shape = [seq_len, heads] + index_k_shape = [seq_len, dim] + shape_p = [seq_len, topk] + topk_indices_shape = [seq_len, topk] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + if sm_scale is None: + sm_scale = dim**-0.5 + + @T.prim_func + def tl_indexer_bwd_kernel( + IndexQ: T.Tensor(index_q_shape, dtype), + Weights: T.Tensor(weights_shape, dtype), + IndexK: T.Tensor(index_k_shape, dtype), + dIndexQ: T.Tensor(index_q_shape, dtype), + dWeights: T.Tensor(weights_shape, dtype), + dIndexK: T.Tensor(index_k_shape, dtype), + AttnScore: T.Tensor(shape_p, FP32), + IndexScore: T.Tensor(shape_p, FP32), + TopkIndices: T.Tensor(topk_indices_shape, INT32), + Offsets: T.Tensor(offsets_shape, INT32), + TokenIndices: T.Tensor(token_indices_shape, INT32), + ): + with T.Kernel(seq_len, threads=num_threads) as (bx): + i_b, i_t = TokenIndices[bx, 0], TokenIndices[bx, 1] + bos = Offsets[i_b] + num_blocks = T.ceildiv(topk, block_I) + + index_q_shared = T.alloc_shared([heads, dim], dtype=dtype) + weights_shared = 
T.alloc_shared([heads], dtype=dtype) + + d_index_q_frag = T.alloc_fragment([heads, dim], dtype=accum_dtype) + d_weights_frag = T.alloc_fragment([heads], dtype=accum_dtype) + + T.copy(IndexQ[bos + i_t, :, :], index_q_shared) + T.copy(Weights[bos + i_t, :], weights_shared) + T.fill(d_index_q_frag, 0) + T.fill(d_weights_frag, 0) + + for i, j in T.Parallel(heads, dim): + index_q_shared[i, j] = index_q_shared[i, j] * sm_scale + + for bi_i in T.Pipelined(num_blocks, num_stages=num_stages): + + i_st = bi_i * block_I + i_ed = (bi_i + 1) * block_I + + indices_shared = T.alloc_shared([block_I], dtype=INT32) + T.copy(TopkIndices[bos + i_t, i_st:i_ed], indices_shared) + + index_k_shared = T.alloc_shared([block_I, dim], dtype=dtype) + for i, j in T.Parallel(block_I, dim): + pos = indices_shared[i] + index_k_shared[i, j] = T.if_then_else((pos > -1) & (pos <= i_t), + IndexK[bos + pos, j], 0) + + attn_score_shared = T.alloc_shared([block_I], dtype=accum_dtype) + index_score_shared = T.alloc_shared([block_I], dtype=accum_dtype) + for i in T.Parallel(block_I): + attn_score_shared[i] = AttnScore[bos + i_t, i_st + i] + index_score_shared[i] = IndexScore[bos + i_t, i_st + i] + + logits = T.alloc_fragment((block_I, heads), accum_dtype) + T.gemm( + index_k_shared, + index_q_shared, + logits, + transpose_A=False, + transpose_B=True, + clear_accum=True, + ) + for i, j in T.Parallel(block_I, heads): + logits[i, j] = T.max(logits[i, j], 0) + + # dw + d_weights_i = T.alloc_fragment((block_I, heads), accum_dtype) + for i, j in T.Parallel(block_I, heads): + d_weights_i[i, + j] = (index_score_shared[i] - attn_score_shared[i]) * logits[i, j] + T.reduce_sum(d_weights_i, d_weights_frag, dim=0, clear=False) + + d_logits_qk = T.alloc_shared((block_I, heads), accum_dtype) + d_logits_qk_cast1 = T.alloc_fragment((block_I, heads), dtype) + d_logits_qk_cast2 = T.alloc_fragment((block_I, heads), dtype) + + for i, j in T.Parallel(block_I, heads): + d_relu = T.alloc_var(accum_dtype) + if logits[i, j] > 0: + 
d_relu = 1.0 + else: + d_relu = 0.0 + d_logits_qk[i, j] = (index_score_shared[i] - + attn_score_shared[i]) * d_relu * weights_shared[j] + + # dq + T.copy(d_logits_qk, d_logits_qk_cast1) + T.gemm( + d_logits_qk_cast1, # [BS, HQ] + index_k_shared, # [BS, K] + d_index_q_frag, # [HQ, K] + transpose_A=True, + transpose_B=False, + clear_accum=False, + ) + + # dk + T.copy(d_logits_qk, d_logits_qk_cast2) + d_index_k_frag = T.alloc_fragment([block_I, dim], dtype=accum_dtype) + T.gemm( + d_logits_qk_cast2, # [BS, HQ] + index_q_shared, # [HQ, K] + d_index_k_frag, # [BS, K] + transpose_A=False, + transpose_B=False, + clear_accum=True, + ) + + for i, j in T.Parallel(block_I, dim): + pos = indices_shared[i] + if ((pos > -1) & (pos <= i_t)): + T.atomic_add(dIndexK[bos + pos, j], d_index_k_frag[i, j]) + + for i, j in T.Parallel(heads, dim): + d_index_q_frag[i, j] = d_index_q_frag[i, j] * sm_scale + + T.copy(d_index_q_frag, dIndexQ[bos + i_t, :, :]) + T.copy(d_weights_frag, dWeights[bos + i_t, :]) + + return tl_indexer_bwd_kernel + + +def indexer_bwd_interface( + q: torch.Tensor, + weights: torch.Tensor, + k: torch.Tensor, + attn_score: torch.Tensor, + index_score: torch.Tensor, + topk_indices: torch.Tensor, + offsets: torch.Tensor, +): + _, heads, dim, topk = *q.shape, topk_indices.shape[-1] + token_indices = prepare_token_indices(offsets) + dq = torch.zeros_like(q) + dweights = torch.zeros_like(weights) + dk = torch.zeros_like(k) + kernel = tl_indexer_bwd_impl(heads, dim, topk) + kernel(q, weights, k, dq, dweights, dk, attn_score, index_score, topk_indices, offsets, + token_indices) + return dq, dweights, dk + + +def ref_indexer_bwd(Q: torch.Tensor, Weights: torch.Tensor, K: torch.Tensor, + TopkIndices: torch.Tensor, AttnScore: torch.Tensor, + offsets: torch.Tensor) -> torch.Tensor: + Q.requires_grad_(True) + Weights.requires_grad_(True) + K.requires_grad_(True) + softmax_scale = Q.shape[-1]**-0.5 + all_loss = [] + all_log_topk_prob = [] + for i in range(offsets.shape[0] - 1): + 
assert (offsets[i + 1] - offsets[i]).item() >= TopkIndices.shape[-1] + q = Q[offsets[i]:offsets[i + 1]] + weights = Weights[offsets[i]:offsets[i + 1]] + k = K[offsets[i]:offsets[i + 1]] + topk_indices = TopkIndices[offsets[i]:offsets[i + 1]] + attn_score = AttnScore[offsets[i]:offsets[i + 1]] + s = q.shape[0] + mask = (torch.arange(s)[:, None] >= torch.arange(s)[None, :]).to(q.device) + logits = einsum(q, k, 's1 h k, s2 k -> s1 h s2') * softmax_scale + logits = F.relu(logits) + score = (logits * weights.unsqueeze(-1)).sum(dim=-2, dtype=torch.float32) + score = torch.where(mask, score, float('-inf')) + topk_value = torch.gather(score, dim=-1, index=topk_indices.to(torch.int64)) + log_topk_prob = F.log_softmax(topk_value, dim=-1, dtype=torch.float32) + loss = F.kl_div( + log_topk_prob.clip(-100, 0), + attn_score.log().clip(-100, 0), + log_target=True, + reduction="sum") + all_loss.append(loss) + all_log_topk_prob.append(log_topk_prob) + loss = torch.stack(all_loss).sum() + loss.backward() + log_topk_prob = torch.cat(all_log_topk_prob, dim=0) + return log_topk_prob.exp(), Q.grad, Weights.grad, K.grad + + +def test_kernel( + B=1, + S=2048, + H=16, + D=128, + topk=64, +): + torch.manual_seed(42) + q = torch.randn((S, H, D)).cuda().bfloat16() + w = torch.randn((S, H)).cuda().bfloat16() + k = torch.randn((S, D)).cuda().bfloat16() + offsets = torch.tensor([0, 1023, S], dtype=torch.int32).cuda() + + all_attn_score = [] + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + mask = (torch.arange(seq_len)[:, None] >= torch.arange(topk)[None, :]).to(q.device) + logits = torch.ones(seq_len, topk).cuda() + logits = torch.where(mask, logits, float('-inf')) + attn_score = F.softmax(logits, dim=-1, dtype=torch.float32) + all_attn_score.append(attn_score) + attn_score = torch.cat(all_attn_score, dim=0) + + topk_indices = repeat( + torch.arange(topk, dtype=torch.int32).cuda(), 'k -> s k', s=S).contiguous() + index_score, ref_dq, ref_dw, ref_dk = 
ref_indexer_bwd(q, w, k, topk_indices, attn_score, + offsets) + + dq, dw, dk = indexer_bwd_interface(q, w, k, attn_score, index_score, topk_indices, offsets) + + print(f"dq err: {get_abs_err(dq, ref_dq):.6f} ratio: {get_err_ratio(dq, ref_dq):.6f}") + print(f"dq err: {get_abs_err(dw, ref_dw):.6f} ratio: {get_err_ratio(dw, ref_dw):.6f}") + print(f"dq err: {get_abs_err(dk, ref_dk):.6f} ratio: {get_err_ratio(dk, ref_dk):.6f}") + + +if __name__ == '__main__': + test_kernel() diff --git a/examples/dsa_sparse_finetune/indexer_topk_reducesum.py b/examples/dsa_sparse_finetune/indexer_topk_reducesum.py new file mode 100644 index 00000000..b7fa6627 --- /dev/null +++ b/examples/dsa_sparse_finetune/indexer_topk_reducesum.py @@ -0,0 +1,277 @@ +import math +import torch +import torch.nn.functional as F +from einops import einsum + +import tilelang as tl +import tilelang.language as T +from typing import Optional +from index import prepare_token_indices + +from utils import get_abs_err, get_err_ratio + +BF16 = "bfloat16" +FP32 = "float32" +INT32 = "int32" + +pass_configs = { + tl.PassConfigKey.TL_DISABLE_THREAD_STORAGE_SYNC: True, + tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, +} + + +@tl.jit(pass_configs=pass_configs) +def tl_indexer_topk_reducesum_impl( + heads: int, + dim: int, + topk: int, + sm_scale: Optional[float] = None, + block_K: int = 32, + dtype: str = FP32, + num_stages: int = 0, + num_threads: int = 128, +): + assert topk == tl.math.next_power_of_2(topk) + assert topk % block_K == 0 + assert heads <= 64 and heads % 8 == 0 + assert num_stages == 0 + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + + index_q_shape = [seq_len, heads, dim] + weights_shape = [seq_len, heads] + index_k_shape = [seq_len, dim] + topk_indices_shape = [seq_len, topk] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + + N = 2 * topk + num_iters = int(round(math.log2(N))) + if sm_scale 
is None: + sm_scale = dim**-0.5 + + @T.macro + def bitonic_sort( + topk_index_shared: T.SharedBuffer([N], dtype=INT32), + topk_value_shared: T.SharedBuffer([N], dtype=FP32), + ): + T.sync_threads() + for i1 in T.serial(num_iters): + for i2 in T.serial(i1 + 1): + for i in T.Parallel(N): + ascending = (i & (1 << (i1 + 1))) != 0 + j = i ^ (1 << (i1 - i2)) + if i < j and \ + ((ascending and topk_value_shared[i] > topk_value_shared[j]) or ( + not ascending and topk_value_shared[i] < topk_value_shared[j])): + val = topk_value_shared[i] + topk_value_shared[i] = topk_value_shared[j] + topk_value_shared[j] = val + idx = topk_index_shared[i] + topk_index_shared[i] = topk_index_shared[j] + topk_index_shared[j] = idx + T.sync_threads() + + @T.prim_func + def tl_indexer_topk_reducesum_kernel( + IndexQ: T.Tensor(index_q_shape, dtype), + Weights: T.Tensor(weights_shape, dtype), + IndexK: T.Tensor(index_k_shape, dtype), + TopkIndices: T.Tensor(topk_indices_shape, INT32), + ReduceSum: T.Tensor(topk_indices_shape, FP32), + Offsets: T.Tensor(offsets_shape, INT32), + TokenIndices: T.Tensor(token_indices_shape, INT32), + ): + with T.Kernel(seq_len, threads=num_threads) as (bx): + i_b, i_t = TokenIndices[bx, 0], TokenIndices[bx, 1] + bos, eos = Offsets[i_b], Offsets[i_b + 1] + num_blocks = T.ceildiv(i_t + 1, block_K) + + topk_index_shared = T.alloc_shared([N], dtype=INT32) + topk_value_shared = T.alloc_shared([N], dtype=FP32) + + T.fill(topk_index_shared, -1) + T.fill(topk_value_shared, float('-inf')) + T.sync_threads() + + index_q_shared = T.alloc_shared([heads, dim], dtype=dtype) + T.copy(IndexQ[bos + i_t, :, :], index_q_shared) + T.sync_threads() + + weights_frag = T.alloc_shared([heads], dtype=dtype) + T.copy(Weights[bos + i_t, :], weights_frag) + T.sync_threads() + + for i, j in T.Parallel(heads, dim): + index_q_shared[i, j] = index_q_shared[i, j] * sm_scale + T.sync_threads() + + for bk_i in T.Pipelined(num_blocks, num_stages=num_stages): + k_st = bk_i * block_K + k_ed = 
T.min((bk_i + 1) * block_K, eos - bos) + + index_k_shared = T.alloc_shared([block_K, dim], dtype=dtype) + for i, j in T.Parallel(block_K, dim): + index_k_shared[i, j] = T.if_then_else(k_st + i < k_ed, IndexK[bos + k_st + i, + j], 0) + T.sync_threads() + + logits = T.alloc_fragment((block_K, heads), FP32) + T.gemm( + index_k_shared, + index_q_shared, + logits, + transpose_A=False, + transpose_B=True, + clear_accum=True, + ) + T.sync_threads() + + for i, j in T.Parallel(block_K, heads): + logits[i, j] = T.max(logits[i, j], 0) * weights_frag[j] + T.sync_threads() + + logits_sum = T.alloc_fragment(block_K, FP32) + T.reduce_sum(logits, logits_sum, dim=1) + T.sync_threads() + + offset = T.alloc_var(INT32) + if k_st >= topk: + offset = topk + (k_st % topk) + else: + offset = k_st + T.sync_threads() + for i in T.Parallel(block_K): + if k_st + i > i_t: + logits_sum[i] = float('-inf') + j = offset + i + topk_index_shared[j] = k_st + i + topk_value_shared[j] = logits_sum[i] + T.sync_threads() + + if k_ed > topk and k_ed % topk == 0: + bitonic_sort(topk_index_shared, topk_value_shared) + + bitonic_sort(topk_index_shared, topk_value_shared) + + logits_max_frag = T.alloc_fragment([1], dtype=FP32) + logits_frag = T.alloc_fragment([topk], dtype=FP32) + reducesum_shared = T.alloc_shared([topk], dtype=FP32) + + T.copy(topk_value_shared[:topk], logits_frag) + T.sync_threads() + + T.reduce_max(logits_frag, logits_max_frag, dim=-1) + T.sync_threads() + + for i in T.Parallel(topk): + logits_frag[i] = T.exp(logits_frag[i] - logits_max_frag[0]) + T.sync_threads() + + lse_frag = T.alloc_fragment([1], dtype=FP32) + T.reduce_sum(logits_frag, lse_frag) + T.sync_threads() + + for i in T.Parallel(topk): + reducesum_shared[i] = logits_frag[i] / lse_frag[0] + T.sync_threads() + + # for i in T.Parallel(topk): + # reducesum_shared[i] = logits_frag[i] + # T.sync_threads() + + for i in T.Parallel(topk): + if topk_index_shared[i] > i_t: + topk_index_shared[i] = -1 + T.sync_threads() + + 
T.copy(topk_index_shared[:topk], TopkIndices[bos + i_t, :]) + T.copy(reducesum_shared[:topk], ReduceSum[bos + i_t, :]) + + return tl_indexer_topk_reducesum_kernel + + +def indexer_topk_reducesum_interface( + q: torch.Tensor, + weights: torch.Tensor, + k: torch.Tensor, + topk: int, + offsets: torch.Tensor, + dtype: str = BF16, +): + seq_len, heads, dim = q.shape + kernel = tl_indexer_topk_reducesum_impl(heads=heads, dim=dim, topk=topk, dtype=dtype) + token_indices = prepare_token_indices(offsets) + topk_indices = torch.zeros((seq_len, topk), device=q.device, dtype=torch.int32) + topk_score = torch.zeros((seq_len, topk), device=q.device, dtype=torch.float32) + kernel(q, weights, k, topk_indices, topk_score, offsets, token_indices) + return topk_indices, topk_score + + +def ref_index_score(Q: torch.Tensor, Weights: torch.Tensor, K: torch.Tensor, topk: int, + offsets: torch.Tensor) -> torch.Tensor: + all_topk_indices = [] + all_topk_score = [] + for i in range(offsets.shape[0] - 1): + assert (offsets[i + 1] - offsets[i]).item() >= topk + q = Q[offsets[i]:offsets[i + 1]] + weights = Weights[offsets[i]:offsets[i + 1]] + k = K[offsets[i]:offsets[i + 1]] + softmax_scale = q.shape[-1]**-0.5 + s = q.shape[0] + mask = (torch.arange(s)[:, None] >= torch.arange(s)[None, :]).to(q.device) + logits = einsum(q, k, 's1 h k, s2 k -> s1 h s2') + logits = F.relu(logits) + logits = (logits * weights.unsqueeze(-1)).sum(dim=-2, dtype=torch.float32) * softmax_scale + logits = torch.where(mask, logits, float('-inf')) + topk_logits, topk_indices = torch.topk(logits, k=topk, dim=-1) + topk_score = F.softmax(topk_logits, dim=-1, dtype=torch.float32) + all_topk_indices.append(topk_indices) + all_topk_score.append(topk_score) + topk_indices = torch.cat(all_topk_indices, dim=0) + topk_score = torch.cat(all_topk_score, dim=0) + return topk_indices, topk_score + + +def test_kernel( + B=1, + S=2048, + H=64, + D=128, + topk=64, +): + torch.manual_seed(42) + + q = torch.randn((S, H, 
D)).cuda().bfloat16() + weights = torch.randn((S, H)).cuda().bfloat16() + k = torch.randn((S, D)).cuda().bfloat16() + offsets = torch.tensor([0, S], dtype=torch.int32).cuda() + + ref_topk_indices, ref_topk_score = ref_index_score(q, weights, k, topk, offsets) + + topk_indices, topk_score = indexer_topk_reducesum_interface(q, weights, k, topk, offsets) + + for j in range(S): + ref_np = ref_topk_indices[j].cpu().to(torch.int32).numpy() + trt_np = topk_indices[j].cpu().to(torch.int32).numpy() + + ref_np_val = ref_topk_score[j] + trt_np_val = topk_score[j] + + mask = (ref_np_val > 0).cpu().numpy() + + set_ref = set(ref_np[mask]) + set_trt = set(trt_np[mask]) + intersection = set_ref & set_trt + + print("idx:", j, "selected/all:", len(intersection), "/", len(set_ref), "=", + len(intersection) / len(set_ref)) + + print( + f"err: {get_abs_err(ref_np_val, trt_np_val):.6f} ratio: {get_err_ratio(ref_np_val, trt_np_val):.6f}" + ) + + +if __name__ == '__main__': + test_kernel() diff --git a/examples/dsa_sparse_finetune/sparse_mla_bwd.py b/examples/dsa_sparse_finetune/sparse_mla_bwd.py new file mode 100644 index 00000000..33c21cb4 --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_bwd.py @@ -0,0 +1,420 @@ +# ruff: noqa +import tilelang +from tilelang import language as T +import torch +from index import prepare_token_indices + +from utils import assert_tensors_similar + + +@tilelang.jit(out_idx=[-1]) +def preprocess( + H, + D, + block_ND=32, + num_stages=5, + dtype="bfloat16", + accum_dtype="float", +): + assert dtype == "bfloat16" + assert accum_dtype == "float" + + S = T.symbolic('S') + + shape = [S, H, D] + + @T.prim_func + def preprocess_kernel( + O: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + Delta: T.Tensor([S, H], accum_dtype), + ): + with T.Kernel(H, T.ceildiv(S, block_ND)) as (bx, by): + o = T.alloc_fragment([block_ND, block_ND], accum_dtype) + do = T.alloc_fragment([block_ND, block_ND], accum_dtype) + delta = T.alloc_fragment([block_ND], 
accum_dtype) + acc = T.alloc_fragment([block_ND, block_ND], accum_dtype) + T.clear(acc) + for k in T.Pipelined(T.ceildiv(D, block_ND), num_stages=num_stages): + T.copy(O[by * block_ND:(by + 1) * block_ND, bx, k * block_ND:(k + 1) * block_ND], o) + T.copy(dO[by * block_ND:(by + 1) * block_ND, bx, k * block_ND:(k + 1) * block_ND], + do) + for i, j in T.Parallel(block_ND, block_ND): + acc[i, j] += o[i, j] * do[i, j] + T.reduce_sum(acc, delta, 1) + T.copy(delta, Delta[by * block_ND:(by + 1) * block_ND, bx]) + + return preprocess_kernel + + +@tilelang.jit(out_idx=[-1]) +def postprocess( + D, + D_tail, + kv_group=1, + block_N=64, + threads=128, + dtype="bfloat16", + accum_dtype="float", +): + assert dtype == "bfloat16" + assert accum_dtype == "float" + S_kv = T.symbolic('S_kv') + + dkv_shape = [S_kv, kv_group, D + D_tail] + + @T.prim_func + def postprocess_kernel( + dKV: T.Tensor(dkv_shape, accum_dtype), + dKV_out: T.Tensor(dkv_shape, dtype), + ): + with T.Kernel(T.ceildiv(S_kv, block_N), kv_group, threads=threads) as (bx, by): + T.copy( + dKV[bx * block_N:(bx + 1) * block_N, by, :], + dKV_out[bx * block_N:(bx + 1) * block_N, by, :], + ) + + return postprocess_kernel + + +@tilelang.jit( + out_idx=[-2], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }) +def bwd( + H, + D, + D_tail, + topk, + kv_group=1, + sm_scale=None, + is_causal=True, + block_size=32, + num_stages=0, + threads=128, + indices_dtype="int32", + dtype="bfloat16", + accum_dtype="float", +): + assert is_causal == True, 'non-casual is not supported now' + assert topk % block_size == 0, 'otherwise will load some index=0 thus causing wrong kv to be loaded' + assert dtype == "bfloat16" + assert accum_dtype == "float" + assert indices_dtype == "int32" + + if sm_scale is None: + sm_scale = (D + D_tail)**(-0.5) + + B_plus_one = T.symbolic('B_plus_one') + S = T.symbolic('S') + + H_kv = H // kv_group + q_shape = [S, H, D + D_tail] + 
k_shape = [S, kv_group, D + D_tail] + o_shape = [S, H, D] + indices_shape = [S, kv_group, topk] + delta_shape = [S, H] + lse_shape = [S, H] + offsets_shape = [B_plus_one] + token_indices_shape = [S, 2] + assert indices_dtype == "int32" + assert dtype == "bfloat16" + assert accum_dtype == "float" + + H = H_kv + padded_H = max(tilelang.math.next_power_of_2(H_kv), 16) + BS = block_size + NS = tilelang.cdiv(topk, block_size) + + split_store = 2 + + @T.prim_func + def sparse_mla_bwd_kernel( + Q: T.Tensor(q_shape, dtype), + KV: T.Tensor(k_shape, dtype), + dO: T.Tensor(o_shape, dtype), + Indices: T.Tensor(indices_shape, indices_dtype), + Lse: T.Tensor(lse_shape, accum_dtype), + Delta: T.Tensor(delta_shape, accum_dtype), + Offsets: T.Tensor(offsets_shape, indices_dtype), + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), + dQ: T.Tensor(q_shape, dtype), + dKV: T.Tensor(k_shape, accum_dtype), + ): + with T.Kernel(S, kv_group, threads=threads) as (b_s_i, bz): + Q_shared = T.alloc_shared([padded_H, D], dtype) + Q_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + KV_shared = T.alloc_shared([BS, D], dtype) + KV_tail_shared = T.alloc_shared([BS, D_tail], dtype) + dO_shared = T.alloc_shared([padded_H, D], dtype) + mask = T.alloc_fragment([BS], "bool") + + P_shared_cast = T.alloc_shared([padded_H, BS], dtype) + dP_shared_cast = T.alloc_shared([padded_H, BS], dtype) + dQ_shared = T.alloc_shared([padded_H, D], dtype) + dQ_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + + acc_p = T.alloc_fragment([padded_H, BS], accum_dtype) + acc_dp = T.alloc_fragment([padded_H, BS], accum_dtype) + acc_dq = T.alloc_fragment([padded_H, D], accum_dtype) + acc_dq_tail = T.alloc_fragment([padded_H, D_tail], accum_dtype) + acc_dkv = T.alloc_fragment([BS, D], accum_dtype) + acc_dkv_tail = T.alloc_fragment([BS, D_tail], accum_dtype) + acc_dkv_shared = T.view(KV_shared, shape=[BS // split_store, D], dtype=accum_dtype) + acc_dkv_tail_shared = T.view( + KV_tail_shared, shape=[BS // 
split_store, D_tail], dtype=accum_dtype) + + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = Offsets[b_i], Offsets[b_i + 1] + + max_kv_i = s_i + + T.copy(Q[bos + s_i, bz * padded_H:(bz + 1) * padded_H, :D], Q_shared) + T.copy(Q[bos + s_i, bz * padded_H:(bz + 1) * padded_H, D:], Q_tail_shared) + T.copy(dO[bos + s_i, bz * padded_H:(bz + 1) * padded_H, :D], dO_shared) + + T.clear(acc_dq) + T.clear(acc_dq_tail) + + T.annotate_layout({ + dQ_shared: tilelang.layout.make_swizzled_layout(dQ_shared), + dQ_tail_shared: tilelang.layout.make_swizzled_layout(dQ_tail_shared), + }) + + # Process each block of indices + for i_i in T.Pipelined(NS, num_stages=num_stages): + # Check which indices are valid + for bi_i in T.Parallel(BS): + mask[bi_i] = (Indices[bos + s_i, bz, i_i * BS + bi_i] <= max_kv_i) & ( + Indices[bos + s_i, bz, i_i * BS + bi_i] != -1) + + # Compute attention scores + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_p[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_p.dtype)) + + # Load KV, V for this block of indices + for bi_i, d_i in T.Parallel(BS, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i], bz, + d_i] + + T.gemm( + Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + + for bi_i, d_i in T.Parallel(BS, D_tail): + KV_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i], + bz, D + d_i] + T.gemm( + Q_tail_shared, + KV_tail_shared, + acc_p, + transpose_B=True, + policy=T.GemmWarpPolicy.FullCol) + + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_p[h_i, bi_i] = T.exp(acc_p[h_i, bi_i] * sm_scale - + Lse[bos + s_i, bz * padded_H + h_i]) + + T.copy(acc_p, P_shared_cast) + + T.gemm( + dO_shared, + KV_shared, + acc_dp, + transpose_B=True, + policy=T.GemmWarpPolicy.FullCol, + clear_accum=True) + + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * ( + acc_dp[h_i, bi_i] - Delta[bos + s_i, bz * padded_H + h_i]) * sm_scale + + 
T.copy(acc_dp, dP_shared_cast) + T.gemm(dP_shared_cast, KV_shared, acc_dq, policy=T.GemmWarpPolicy.FullCol) + T.gemm(dP_shared_cast, KV_tail_shared, acc_dq_tail, policy=T.GemmWarpPolicy.FullCol) + + T.gemm( + dP_shared_cast, + Q_shared, + acc_dkv, + transpose_A=True, + policy=T.GemmWarpPolicy.FullCol, + clear_accum=True) + T.gemm( + P_shared_cast, + dO_shared, + acc_dkv, + transpose_A=True, + policy=T.GemmWarpPolicy.FullCol) + + T.clear(acc_dkv_tail) + T.gemm( + dP_shared_cast, + Q_tail_shared, + acc_dkv_tail, + transpose_A=True, + policy=T.GemmWarpPolicy.FullCol) + + for s in range(split_store): + for bi_i, d_i in T.Parallel(BS, D): + if bi_i < BS // split_store: + acc_dkv_shared[bi_i, d_i] = acc_dkv[bi_i + s * (BS // split_store), d_i] + + for bi_i, d_i in T.Parallel(BS, D_tail): + if bi_i < BS // split_store: + acc_dkv_tail_shared[bi_i, + d_i] = acc_dkv_tail[bi_i + s * (BS // split_store), + d_i] + + for bi_i, d_i in T.Parallel(BS // split_store, D // 4): + T.atomic_addx4( + dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * + (BS // split_store)], bz, d_i * 4], + acc_dkv_shared[bi_i, d_i * 4]) + + # Atomically update dKV, dKV_tail tensors + for bi_i, d_i in T.Parallel(BS // split_store, D_tail // 4): + T.atomic_addx4( + dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * + (BS // split_store)], bz, D + d_i * 4], + acc_dkv_tail_shared[bi_i, d_i * 4]) + + # Store the accumulated dQ + T.copy(acc_dq, dQ_shared) + T.copy(acc_dq_tail, dQ_tail_shared) + + T.copy(dQ_shared, dQ[bos + s_i, bz * padded_H:(bz + 1) * padded_H, :D]) + T.copy(dQ_tail_shared, dQ[bos + s_i, bz * padded_H:(bz + 1) * padded_H, D:]) + + return sparse_mla_bwd_kernel + + +def sparse_mla_bwd(q, + kv, + o, + do, + indices, + lse, + offsets, + sm_scale=None, + is_casual=True, + return_kernel=False, + delta=None): + assert q.is_contiguous() + assert kv.is_contiguous() + assert indices.is_contiguous() + assert lse.is_contiguous() + S, H, dim_plus_tail_dim = q.shape + S_kv, kv_group, _ = kv.shape + 
assert kv.shape[-1] == dim_plus_tail_dim + assert S == S_kv + # dim should be assigned + D = 512 + + D_tail = dim_plus_tail_dim - D + topk = indices.shape[-1] + assert indices.shape == (S, kv_group, topk) + assert lse.shape == (S, H) + + token_indices = prepare_token_indices(offsets) + + # Get kernels + preprocess_kernel = preprocess(H, D) + bwd_kernel = bwd(H, D, D_tail, topk, kv_group, sm_scale, is_casual) + postprocess_kernel = postprocess(D, D_tail, kv_group) + + if delta is None: + delta = preprocess_kernel(o, do) + dkv = torch.zeros_like(kv, dtype=torch.float32) + dq = bwd_kernel(q, kv, do, indices, lse, delta, offsets, token_indices, dkv) + dkv = postprocess_kernel(dkv) + + return dq, dkv + + +def ref_sparse_mla_bwd_interface(q, + kv, + o, + do, + indices, + lse, + offsets, + sm_scale=None, + is_casual=True): + from sparse_mla_fwd import ref_sparse_mla_fwd_interface + q = q.detach().clone() + kv = kv.detach().clone() + q.requires_grad = True + kv.requires_grad = True + o = ref_sparse_mla_fwd_interface(q, kv, indices, offsets, sm_scale, is_casual) + o.backward(do) + return q.grad, kv.grad + + +def test_sparse_mla_bwd(B=1, + S=2048, + H=64, + HKV=1, + DQKV=576, + DV=512, + topk=512, + dtype=torch.bfloat16, + check_correctness=True): + # Prepare data + q = torch.randn((S, H, DQKV), dtype=dtype, device='cuda').requires_grad_(True) + kv = torch.randn((S, HKV, DQKV), dtype=dtype, device='cuda').requires_grad_(True) + do = torch.randn((S, H, DV), dtype=dtype, device='cuda') + offsets = torch.tensor([0, S], dtype=torch.int32, device="cuda") + + indices = torch.full((S, HKV, topk), S, dtype=torch.int32, device='cuda') + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + assert seq_len >= topk + for t in range(seq_len): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[offsets[i] + t, h, :len(i_i)] = i_i + + # Forward + from sparse_mla_fwd import sparse_mla_fwd_interface + tl_out, tl_lse = 
sparse_mla_fwd_interface(q, kv, indices, offsets) + + tl_dq, tl_dkv = sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse, offsets) + ref_dq, ref_dkv = ref_sparse_mla_bwd_interface(q, kv, None, do, indices, None, offsets) + + if check_correctness: + assert_tensors_similar(tl_dq, ref_dq, eps=1e-4, name="dq") + assert_tensors_similar(tl_dkv, ref_dkv, eps=1e-4, name="dkv") + print("assert_tensors_similar passed") + + per_token_flop = 2 * sum([ + H * DV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DV * topk, + ]) + from tilelang.profiler import do_bench + + def fn(): + return sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse, offsets) + + ms = do_bench(fn, rep=100, warmup=250) + print(f"Average time: {ms:.3f} ms") + print(f'bwd io bandwidth = ', + (B * S * max(DQKV * 2, DQKV + DV) * topk * 2) / (ms * 1e-3) / 1e12) + print(f'bwd tflops = ', per_token_flop * S / (ms * 1e-3) / 1e12) + + +if __name__ == "__main__": + test_sparse_mla_bwd( + B=1, + S=2048, + H=64, + HKV=1, + DQKV=576, + DV=512, + topk=512, + dtype=torch.bfloat16, + check_correctness=True) diff --git a/examples/dsa_sparse_finetune/sparse_mla_fwd.py b/examples/dsa_sparse_finetune/sparse_mla_fwd.py new file mode 100644 index 00000000..5f03dfbb --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_fwd.py @@ -0,0 +1,332 @@ +# ruff: noqa +import torch +import tilelang +from tilelang import language as T +from index import prepare_token_indices + +from utils import assert_tensors_similar + + +@tilelang.jit( + out_idx=[-2, -1], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }, +) +def sparse_mla_fwd( + heads, + dim, + tail_dim, + topk, + kv_group=1, + sm_scale=None, + is_causal=True, + CP0=True, + block_I=32, + num_stages=2, + threads=128, +): + assert dim == tilelang.math.next_power_of_2( + dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2( + 
tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert is_causal == True, "non-casual is not supported" + assert (topk % + block_I == 0), "otherwise will load some index=0 thus causing wrong kv to be loaded" + if sm_scale is None: + sm_scale = (1.0 / (dim + tail_dim))**0.5 + else: + sm_scale = sm_scale + + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + + head_kv = heads // kv_group + q_shape = [seq_len, heads, dim + tail_dim] + kv_shape = [seq_len, kv_group, dim + tail_dim] + o_shape = [seq_len, heads, dim] + indices_shape = [seq_len, kv_group, topk] + lse_shape = [seq_len, heads] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + indices_dtype = "int32" + dtype = "bfloat16" + accum_dtype = "float" + + G = kv_group + H = head_kv + padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) + if padded_H != H: + assert ( + kv_group == 1 + ), "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + BI = block_I + NI = tilelang.cdiv(topk, block_I) + D = dim + D_tail = tail_dim + + if head_kv > 64: + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" + REPLICATE_H = head_kv // 64 + else: + REPLICATE_H = 1 + + H_per_block = padded_H if REPLICATE_H == 1 else 64 + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Offsets: T.Tensor(offsets_shape, indices_dtype), # type: ignore + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + ): + with T.Kernel( + seq_len * REPLICATE_H, kv_group, threads=threads) as ( + bx, + by, + ): + Q_shared = T.alloc_shared([H_per_block, D], 
dtype) + Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) + KV_shared = T.alloc_shared([BI, D], dtype) + K_tail_shared = T.alloc_shared([BI, D_tail], dtype) + mask = T.alloc_fragment([BI], "bool") + + acc_o = T.alloc_fragment([H_per_block, D], accum_dtype) + acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype) + S_shared = T.alloc_shared([H_per_block, BI], dtype) + sumexp = T.alloc_fragment([H_per_block], accum_dtype) + sumexp_i = T.alloc_fragment([H_per_block], accum_dtype) + alpha = T.alloc_fragment([H_per_block], accum_dtype) + m_i = T.alloc_fragment([H_per_block], accum_dtype) + m_i_prev = T.alloc_fragment([H_per_block], accum_dtype) + + T.fill(acc_o, 0) + T.fill(sumexp, 0) + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan + + b_s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H) + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = Offsets[b_i], Offsets[b_i + 1] + g_i = by + q_i = s_i + max_kv_i = q_i + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64) + H1 = H0 + H_per_block + + T.copy(Q[bos + s_i, H0:H1, :D], Q_shared) + T.copy(Q[bos + s_i, H0:H1, D:], Q_tail_shared) + + for i_i in T.Pipelined(NI, num_stages=num_stages): + + for bi_i in T.Parallel(BI): + mask[bi_i] = (Indices[bos + s_i, g_i, i_i * BI + bi_i] <= max_kv_i) & ( + Indices[bos + s_i, g_i, i_i * BI + bi_i] != -1) + + for bi_i, d_i in T.Parallel(BI, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, + d_i] + for bi_i, d_i in T.Parallel(BI, D_tail): + K_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], + g_i, D + d_i] + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_s.dtype)) + T.gemm( + Q_shared, + KV_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + T.gemm( + Q_tail_shared, + K_tail_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + T.copy(m_i, 
m_i_prev) + T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + alpha[h_i] = T.exp((m_i_prev[h_i] - m_i[h_i]) * sm_scale) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.exp(acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale) + T.reduce_sum(acc_s, sumexp_i, dim=1) # is this a accumulate operator? + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = sumexp[h_i] * alpha[h_i] + sumexp_i[h_i] + for h_i, d_i in T.Parallel(H_per_block, D): + acc_o[h_i, d_i] = acc_o[h_i, d_i] * alpha[h_i] + + T.copy(acc_s, S_shared) + T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + # Rescale + for h_i, d_i in T.Parallel(H_per_block, D): + acc_o[h_i, d_i] /= sumexp[h_i] + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = T.log(sumexp[h_i]) + m_i[h_i] * sm_scale + + T.copy(acc_o, Output[bos + s_i, H0:H1, :]) + T.copy(sumexp, Lse[bos + s_i, H0:H1]) + + return main + + +def sparse_mla_fwd_interface(q, + kv, + indices, + offsets, + sm_scale=None, + return_p_sum: bool = False, + d_v=512, + block_I=32, + num_stages=2, + threads=128): + is_casual = True + assert return_p_sum == False, "This kernel file is for fwd only" + assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() + seq_len, heads, dim_plus_tail_dim = q.shape + seq_len_kv, kv_group, _ = kv.shape + assert seq_len == seq_len_kv + + assert dim_plus_tail_dim == 576, "you should assign dim otherwise" + dim = d_v + + assert kv.shape[-1] == dim_plus_tail_dim + tail_dim = dim_plus_tail_dim - dim + _, _, topk = indices.shape + assert indices.shape == (seq_len, kv_group, topk) + + token_indices = prepare_token_indices(offsets) + + kernel = sparse_mla_fwd( + heads, + dim, + tail_dim, + topk, + kv_group, + sm_scale, + is_casual, + block_I=block_I, + num_stages=num_stages, + threads=threads) + out, lse = kernel(q, kv, indices, offsets, token_indices) + return out, lse + + +def ref_sparse_mla_fwd_interface(Q, KV, Indices, offsets, sm_scale=None, 
is_casual=True): + Q = Q.float() + KV = KV.float() + all_o = [] + for i in range(offsets.shape[0] - 1): + q = Q[None, offsets[i]:offsets[i + 1]] + kv = KV[None, offsets[i]:offsets[i + 1]] + indices = Indices[None, offsets[i]:offsets[i + 1]].clone() + + indices = indices.transpose(1, 2) + b, sq, h, dim_q = q.shape + b, sk, g, _ = kv.shape + + assert kv.shape[-1] == 576, "you should assign dim otherwise" + dim = 512 + k = kv + v = kv[..., :dim] + + b, _, _, dim_v = v.shape + g_index = g + h_index = h // g + compressed_casual_mask = torch.arange( + 0, sq, dtype=torch.int32, device="cuda").view(-1, 1) >= torch.arange( + 1 - 1, sk * 1, 1, dtype=torch.int32, device="cuda").view(1, -1) + + indices[indices > sk] = sk + mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) + mask = mask[..., :-1] + mask = mask & compressed_casual_mask.view(1, 1, sq, sk) + mask[:, :, :1 - 1, 0] = True + mask = mask.view(b, g_index, 1, sq, sk) + + q = q.view(b, sq, g, -1, dim_q) + score = torch.einsum("bmghd,bngd->bghmn", q, k) + sm_scale = dim_q**-0.5 if sm_scale is None else sm_scale + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) + p = score.softmax(dim=-1) + p = p.view(b, g_index, h_index, -1, sq, sk) + p = p.view(b, g, -1, sq, sk) + o = torch.einsum("bghmn,bngd->bmghd", p.type(v.dtype), v) + o = o.reshape(b, sq, h, dim_v) + all_o.append(o.squeeze(0)) + o = torch.cat(all_o, dim=0) + return o.to(torch.bfloat16) + + +def test_sparse_mla_fwd(B=1, + S=4096, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=2048, + dtype=torch.bfloat16, + check_correctness=True, + block_I=64, + num_stages=2, + threads=256): + torch.random.manual_seed(0) + q = torch.randn((S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((S, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) + offsets = torch.tensor([0, S // 2 - 1, S], dtype=torch.int32, device="cuda") + + indices = torch.full((S, HKV, topk), S, dtype=torch.int32, 
device="cuda") + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + assert seq_len >= topk + for t in range(seq_len): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[offsets[i] + t, h, :len(i_i)] = i_i + + tl_out, tl_lse = sparse_mla_fwd_interface( + q, kv, indices, offsets, block_I=block_I, num_stages=num_stages, threads=threads) + + if check_correctness: + # otherwise may cause out of memory + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, offsets) + assert_tensors_similar(tl_out, ref_out, eps=1e-2, name="out") + print("assert_tensors_similar passed") + + def fn(): + return sparse_mla_fwd_interface( + q, kv, indices, offsets, block_I=block_I, num_stages=num_stages, threads=threads) + + from tilelang.profiler import do_bench + + ms = do_bench( + fn, + rep=100, + warmup=250, + ) + print(f"Average time: {ms:.3f} ms") + print("fwd io bandwidth = ", (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) + print("fwd tflops = ", (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) + + +if __name__ == "__main__": + test_sparse_mla_fwd( + B=1, + S=4096, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=1024, + dtype=torch.bfloat16, + check_correctness=True, + block_I=64, + num_stages=2, + threads=256) diff --git a/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py b/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py new file mode 100644 index 00000000..94bdb8fb --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py @@ -0,0 +1,241 @@ +# ruff: noqa +import torch +import torch.nn as nn +import torch.nn.functional as F +import tilelang +from tilelang import language as T +from einops import repeat, rearrange, einsum +from index import prepare_token_indices +from utils import get_abs_err, get_err_ratio + +BF16 = "bfloat16" +FP32 = "float32" +INT32 = "int32" + +pass_configs = { + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + 
tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, +} + + +@tilelang.jit(pass_configs=pass_configs) +def tl_sparse_mla_topk_reducesum_impl( + heads, + dim, + tail_dim, + topk, + kv_group=1, + sm_scale=None, + block_I=32, + num_stages=2, + threads=128, +): + assert dim == tilelang.math.next_power_of_2( + dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2( + tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert (topk % + block_I == 0), "otherwise will load some index=0 thus causing wrong kv to be loaded" + if sm_scale is None: + sm_scale = (1.0 / (dim + tail_dim))**0.5 + + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + seq_len_kv = T.symbolic("seq_len_kv") + + head_kv = heads // kv_group + indices_dtype = "int32" + dtype = "bfloat16" + accum_dtype = "float" + + G = kv_group + H = head_kv + padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) + if padded_H != H: + assert ( + kv_group == 1 + ), "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + BI = block_I + NI = tilelang.cdiv(topk, block_I) + D = dim + D_tail = tail_dim + + if head_kv > 64: + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" + REPLICATE_H = head_kv // 64 + else: + REPLICATE_H = 1 + + H_per_block = padded_H if REPLICATE_H == 1 else 64 + + q_shape = [seq_len, heads, dim + tail_dim] + kv_shape = [seq_len_kv, kv_group, dim + tail_dim] + indices_shape = [seq_len, kv_group, topk] + lse_shape = [seq_len, heads] + reducesum_shape = [seq_len, kv_group, REPLICATE_H, topk] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + + @T.prim_func + def tl_sparse_mla_topk_reducesum_kernel( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: 
T.Tensor(indices_shape, indices_dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + Offsets: T.Tensor(offsets_shape, indices_dtype), # type: ignore + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), # type: ignore + ReduceSum: T.Tensor(reducesum_shape, accum_dtype), # type: ignore + ): + with T.Kernel( + seq_len * REPLICATE_H, kv_group, threads=threads) as ( + bx, + by, + ): + Q_shared = T.alloc_shared([H_per_block, D], dtype) + Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) + KV_shared = T.alloc_shared([BI, D], dtype) + K_tail_shared = T.alloc_shared([BI, D_tail], dtype) + mask = T.alloc_fragment([BI], "bool") + + acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype) + reducesum = T.alloc_fragment([BI], accum_dtype) + lse = T.alloc_fragment([H_per_block], accum_dtype) + + T.fill(lse, 0) + + b_s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H) + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = Offsets[b_i], Offsets[b_i + 1] + r_i = bx % REPLICATE_H + g_i = by + q_i = s_i + max_kv_i = q_i + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64) + H1 = H0 + H_per_block + + T.copy(Q[bos + s_i, H0:H1, :D], Q_shared) + T.copy(Q[bos + s_i, H0:H1, D:], Q_tail_shared) + T.copy(Lse[bos + s_i, H0:H1], lse) + + for i_i in T.Pipelined(NI, num_stages=num_stages): + + for bi_i in T.Parallel(BI): + mask[bi_i] = (Indices[bos + s_i, g_i, i_i * BI + bi_i] <= max_kv_i) & ( + Indices[bos + s_i, g_i, i_i * BI + bi_i] != -1) + + for bi_i, d_i in T.Parallel(BI, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, + d_i] + for bi_i, d_i in T.Parallel(BI, D_tail): + K_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], + g_i, D + d_i] + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_s.dtype)) + T.gemm( + Q_shared, + KV_shared, + acc_s, + transpose_B=True, + 
policy=T.GemmWarpPolicy.FullRow, + ) + T.gemm( + Q_tail_shared, + K_tail_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.exp(acc_s[h_i, bi_i] * sm_scale - lse[h_i]) + T.reduce_sum(acc_s, reducesum, dim=0) + T.copy(reducesum, ReduceSum[bos + s_i, g_i, r_i, i_i * BI:i_i * BI + BI]) + + return tl_sparse_mla_topk_reducesum_kernel + + +def sparse_mla_topk_reducesum_interface( + q: torch.Tensor, + kv: torch.Tensor, + topk_indices: torch.Tensor, + lse: torch.Tensor, + offsets: torch.Tensor, + dim_v: int, +): + assert kv.shape[-2] == 1 + seq_len, heads, dim_plus_tail_dim, topk = *q.shape, topk_indices.shape[-1] + REPLICATE_H = max(heads // 64, 1) + tail_dim = dim_plus_tail_dim - dim_v + token_indices = prepare_token_indices(offsets) + + reducesum = torch.zeros([seq_len, 1, REPLICATE_H, topk], dtype=torch.float32, device=q.device) + kernel = tl_sparse_mla_topk_reducesum_impl(heads=heads, dim=dim_v, tail_dim=tail_dim, topk=topk) + kernel(q, kv, topk_indices, lse, offsets, token_indices, reducesum) + reducesum = reducesum.sum(dim=-2) # [batch, seq_len, 1, RH, topk] -> [batch, seq_len, 1, topk] + attn_score = reducesum / reducesum.sum(dim=-1, keepdim=True) + + return attn_score + + +def ref_mla_topk_softmax(Q: torch.Tensor, K: torch.Tensor, TopkIndices: torch.Tensor, + offsets: torch.Tensor): + # q: [batch, seq_len, heads, dim] + # k: [batch, seq_len, dim] + sm_scale = Q.shape[-1]**-0.5 + all_lse = [] + all_topk_score = [] + for i in range(offsets.shape[0] - 1): + q = Q[offsets[i]:offsets[i + 1]] + k = K[offsets[i]:offsets[i + 1]] + topk_indices = TopkIndices[offsets[i]:offsets[i + 1]] + seq_len = q.shape[0] + mask = (torch.arange(seq_len)[:, None] + >= torch.arange(seq_len)[None, :]).unsqueeze(-2).cuda() + logits = einsum(q, k, 's1 h d, s2 d -> s1 h s2') * sm_scale + logits = torch.where(mask, logits, float('-inf')) + score = F.softmax(logits, dim=-1, dtype=torch.float32) + 
score_sum = score.sum(dim=-2) + topk_score = torch.gather(score_sum, dim=-1, index=topk_indices.to(torch.int64)) + topk_score = topk_score / topk_score.sum(dim=-1, keepdim=True) + max_logits = logits.amax(dim=-1).to(torch.float32) + lse = torch.log( + (logits - max_logits.unsqueeze(-1).to(torch.float32)).exp().sum(dim=-1)) + max_logits + all_lse.append(lse) + all_topk_score.append(topk_score) + lse = torch.cat(all_lse, dim=0) + topk_score = torch.cat(all_topk_score, dim=0) + return lse, topk_score + + +def test_kernel( + B=1, + S=2048, + H=16, + D=512, + tail_D=64, + topk=128, +): + torch.manual_seed(42) + + q = torch.randn((S, H, D + tail_D)).cuda().bfloat16() + kv = torch.randn((S, D + tail_D)).cuda().bfloat16() + offsets = torch.tensor([0, 1023, S], dtype=torch.int32).cuda() + + topk_indices = repeat( + torch.arange(topk, dtype=torch.int32).cuda(), 'k -> s k', s=S).contiguous() + + lse, ref_attn_score = ref_mla_topk_softmax(q, kv, topk_indices, offsets) + + kv = kv.unsqueeze(-2) + topk_indices = topk_indices.unsqueeze(-2) + + attn_score = sparse_mla_topk_reducesum_interface( + q, kv, topk_indices, lse, offsets, dim_v=D).squeeze(-2) + print( + f"attn_score err: {get_abs_err(attn_score, ref_attn_score):.6f} ratio: {get_err_ratio(attn_score, ref_attn_score):.6f}" + ) + + +if __name__ == '__main__': + test_kernel() diff --git a/examples/dsa_sparse_finetune/utils.py b/examples/dsa_sparse_finetune/utils.py new file mode 100644 index 00000000..691af64d --- /dev/null +++ b/examples/dsa_sparse_finetune/utils.py @@ -0,0 +1,75 @@ +import torch + + +def get_abs_err(y, x): + x = x.to(torch.float32) + y = y.to(torch.float32) + return (x - y).flatten().abs().max().item() + + +def get_err_ratio(y, x): + x = x.to(torch.float32) + y = y.to(torch.float32) + err = (x - y).flatten().square().mean().sqrt().item() + base = (x).flatten().square().mean().sqrt().item() + return err / base + + +def calculate_tensor_similarity(x, y, name="tensor"): + """ + Calculate similarity between two 
tensors using a normalized dot product metric. + + Unlike torch.testing.assert_close which uses absolute/relative tolerance based on + element-wise differences, this function computes a global similarity score: + sim = 2 * / (||x||^2 + ||y||^2) + + This metric is scale-invariant and measures the cosine-like similarity normalized + by the magnitude of both tensors. It returns 1 for identical tensors and values + closer to 0 for dissimilar ones. This is particularly useful for comparing tensors + with varying magnitudes where relative errors matter more than absolute differences. + + Args: + x: First tensor to compare + y: Second tensor to compare + name: Name of the tensor for logging purposes + + Returns: + Similarity score in range [0, 1] where 1 means identical + """ + x, y = x.data.double(), y.data.double() + denominator = (x * x + y * y).sum() + if denominator == 0: + print(f"\033[33mWARNING: {name} all zero\033[0m") + return 1 + sim = 2 * (x * y).sum() / denominator + return sim + + +def assert_tensors_similar(x, y, eps=1e-8, name="tensor", raise_assert=True): + """ + Assert that two tensors are similar using a global similarity metric. + + Key differences from torch.testing.assert_close: + - torch.testing.assert_close: Uses element-wise comparison with rtol/atol, checking + that |x - y| <= atol + rtol * |y| for each element. It's sensitive to outliers + and requires all elements to satisfy the tolerance. + - assert_tensors_similar: Uses a single global similarity score (1 - sim) where sim is the + normalized dot product. It's more robust to outliers and focuses on overall + tensor similarity rather than element-wise precision. This is better suited for + comparing large tensors where a few outlier elements shouldn't fail the test. 
+ + Args: + x: First tensor to compare + y: Second tensor to compare + eps: Maximum allowed difference (1 - similarity), default 1e-8 + name: Name of the tensor for error messages + raise_assert: Whether to raise assertion error on failure + """ + sim = calculate_tensor_similarity(x, y, name) + diff = 1. - sim + if not (0 <= diff <= eps): + print( + f"\033[31mERROR: {name} similarity check failed, diff={diff:.2e} (threshold={eps:.2e})\033[0m" + ) + if raise_assert: + assert False # noqa: B011 -- GitLab From 1e92d11cd252e014c44a1c0dc94deaade14c7d2f Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Fri, 28 Nov 2025 03:28:14 +0800 Subject: [PATCH 056/139] [Refactor] Improve assertion handling in CodeGenCHost and ArgBinder (#1352) * [Refactor] Improve assertion handling in CodeGenCHost and ArgBinder This commit refines the assertion message generation in CodeGenCHost by optimizing the handling of equality checks and reducing buffer size for error messages. Additionally, it enhances the ArgBinder by introducing a nullable guard mechanism for assertions, allowing for more precise error handling when binding arguments. The changes improve the clarity and efficiency of assertion handling across the codebase. * [Enhancement] Update matmul kernel and optimize argument binding This commit enhances the matmul kernel by introducing additional tensor parameters and refining the pipeline stages for improved performance. It also updates the argument binding mechanism to include a flag indicating whether buffers are used, enhancing the efficiency of buffer management. Furthermore, the optimization phase in the engine is improved by adding a simplification step, ensuring better performance and clarity in the generated code. 
* lint fix * [Enhancement] Add tensor checks documentation and improve argument binding assertions This commit introduces a new documentation page for host-side tensor checks, detailing the automatic validations performed by TileLang on kernel arguments. It enhances the ArgBinder by adding assertions for non-null pointers when arguments are used, improving error handling. Additionally, the optimization phase in the engine is updated to include a simplification step, ensuring better performance and clarity in the generated code. * [Enhancement] Update .gitignore and refine matmul kernel for improved performance This commit adds host checks logs to the .gitignore file to prevent unnecessary log files from being tracked. Additionally, it refines the matmul kernel by adjusting pipeline stages, updating tensor parameters, and enhancing argument handling for better performance. The changes also include improved error messages in the argument binding process, ensuring clearer diagnostics for users. * lint fix * lint fix * [Refactor] Simplify tensor_null_test function and remove ptr_null_test This commit refactors the tensor_null_test function by adding a with_bias parameter and removing the ptr_null_test function, which was previously unused. The run_test function is updated to reflect these changes, streamlining the testing process for tensor operations. 
* lint fix * fix --- .gitignore | 3 + docs/compiler_internals/tensor_checks.md | 387 ++++++++++++++++++ docs/index.md | 1 + examples/quickstart.py | 2 +- maint/host_checks/01_num_args_mismatch.py | 21 + maint/host_checks/02_pointer_type_error.py | 22 + maint/host_checks/03_ndim_mismatch.py | 19 + maint/host_checks/04_dtype_mismatch.py | 19 + maint/host_checks/05_shape_mismatch.py | 19 + maint/host_checks/06_strides_mismatch.py | 19 + maint/host_checks/07_device_type_mismatch.py | 18 + maint/host_checks/08_device_id_mismatch.py | 25 ++ maint/host_checks/09_null_data_pointer.py | 25 ++ maint/host_checks/10_scalar_type_mismatch.py | 15 + maint/host_checks/README.md | 21 + maint/host_checks/common.py | 50 +++ maint/host_checks/run_all.py | 71 ++++ src/runtime/error_helpers.cc | 60 +++ src/target/codegen_c_host.cc | 81 +--- src/transform/arg_binder.cc | 205 ++++------ src/transform/arg_binder.h | 2 +- src/transform/make_packed_api.cc | 109 ++++- src/transform/merge_if_stmt.cc | 45 +- src/transform/merge_if_stmt.h | 52 +++ .../python/jit/test_tilelang_jit_nullptr.py | 74 +--- tilelang/engine/phase.py | 1 + tilelang/jit/adapter/tvm_ffi.py | 17 - 27 files changed, 1100 insertions(+), 283 deletions(-) create mode 100644 docs/compiler_internals/tensor_checks.md create mode 100644 maint/host_checks/01_num_args_mismatch.py create mode 100644 maint/host_checks/02_pointer_type_error.py create mode 100644 maint/host_checks/03_ndim_mismatch.py create mode 100644 maint/host_checks/04_dtype_mismatch.py create mode 100644 maint/host_checks/05_shape_mismatch.py create mode 100644 maint/host_checks/06_strides_mismatch.py create mode 100644 maint/host_checks/07_device_type_mismatch.py create mode 100644 maint/host_checks/08_device_id_mismatch.py create mode 100644 maint/host_checks/09_null_data_pointer.py create mode 100644 maint/host_checks/10_scalar_type_mismatch.py create mode 100644 maint/host_checks/README.md create mode 100644 maint/host_checks/common.py create mode 100644 
maint/host_checks/run_all.py create mode 100644 src/runtime/error_helpers.cc create mode 100644 src/transform/merge_if_stmt.h diff --git a/.gitignore b/.gitignore index 752f6cb7..730398df 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,6 @@ cmake-build-*/ # pre-commit cache .pre-commit-cache/* + +# host checks logs +maint/host_checks/logs/* diff --git a/docs/compiler_internals/tensor_checks.md b/docs/compiler_internals/tensor_checks.md new file mode 100644 index 00000000..b4d2a0b3 --- /dev/null +++ b/docs/compiler_internals/tensor_checks.md @@ -0,0 +1,387 @@ +# Tensor Checks (Host-Side Auto-Validation) + +This page explains the host-side checks that TileLang automatically inserts into the generated host stub for kernels. When you pass `torch.Tensor` or any DLPack-compatible object to a TileLang kernel, the host stub validates argument count, pointer kinds, dtype, shape, strides, device, and more — so you don’t need to handwrite Python checks. This keeps the ABI stable and significantly reduces Python overhead compared to doing equivalent checks in Python or via pybind. + +## Why Host-Side Checks +- ABI stability: the entry is based on TVM FFI + DLPack, consistently accepting tensors and scalars. +- Lower overhead: shifting checks from Python into C reduces interpreter/property-access costs; the call overhead is lower than pybind-based approaches. +- Focused error reporting: assertions are raised close to the call site with precise “which field failed” messages. + +## How To Inspect Host Source +You can inspect the auto-generated host source (with all checks and the final device-kernel call) for debugging: + +```python +print(matmul_relu_kernel.get_host_source()) +``` + +--- + +## What The Host Checks + +### 1) Argument count and pointer kind +- `num_args` must match the number of formal parameters; otherwise the kernel returns `-1` with an error message. 
+- Each argument’s FFI type must be a pointer kind (for DLTensor/handle) or a valid scalar type; otherwise you’ll see errors like `Expect arg[i] to be pointer` or a scalar type error. + +### 2) Tensor checks (per tensor, after nullability decision) +- Nullability + - If the tensor is “statically reachable/used” by the function body, the handle must be non-NULL; otherwise: `xxx is expected to have non-NULL pointer`. + - If an input tensor is not used by the function (statically unreachable), NULL is allowed; other field checks are executed only when `handle != NULL`. +- Rank (`ndim`) + - Runtime `ndim` must equal the compile-time rank. +- Data type (`dtype`) + - Match the triple `(code, bits, lanes)` with tolerance: + - `float8_e4m3`: accept `e4m3`, `e4m3fn`, `e4m3fnuz`. + - `float8_e5m2`: accept `e5m2`, `e5m2fnuz`. + - `bool`: accept `int8/uint8` with `bits=8` (same lanes), `kDLBool(code=6, bits=1 or 8)`, and any `bitwidth=1` (lanes must match). + - For packed-bit dtypes (e.g., `Int(1)`, `Int(4)`, `UInt(4)`), strict dtype checking is skipped. +- Shape + - Each runtime dimension is bound to the compile-time shape (constants or symbols) and checked for consistency. + - Linear equations among symbolic dims can be solved on the fly (when there’s only one unknown at a given check point), enabling cross-tensor constraints. +- Strides + - If `buffer_type = AutoBroadcast`: allow `strides == NULL` and derive strides from `shape`. If explicit `strides` is present, bind to compile-time constraints and check for equality. + - Otherwise: check per-dimension; if `strides == NULL`, derive from `shape` and compare (e.g., contiguous: `strides[-1] == 1`, `strides[-2] == shape[-1]`). +- `byte_offset` + - Must be 0 (non-zero raises an error) to keep addressing simple and aligned. +- Device info + - Assert `device_type == target backend` (CUDA/ROCM/Metal/OneAPI/WebGPU/CPU, etc.). Error messages include a DLPack code legend. 
+ - When multiple tensors participate, assert that `device_id` matches across them. +- Data pointer + - Must be non-NULL when the tensor is required to be non-null by the nullability rule. + +### 3) Scalar checks +- `T.int*` family: require integer; error: `Expect arg[i] to be int`. +- `T.bool`: require boolean; error: `Expect arg[i] to be boolean`. + +--- + +## Shapes and Symbolic Equations: Linear Solving +When shapes are symbolic, the host binds and (when possible) solves linear relations at runtime (only one unknown per check point). Example: + +```python +@T.prim_func +def main( + A: T.Tensor((m,), dtype), + B: T.Tensor((m + n,), dtype), + C: T.Tensor((n * k,), dtype), +): + ... +``` + +This enables enforcing cross-tensor relationships like `len(B) == m + n` and `len(C) == n * k` at runtime. + +--- + +## Nullability Rules and Examples +Which tensors may be NULL? + +- Rule: If an input tensor is not used by the function under static analysis (i.e., the access is statically unreachable), it is considered Nullable; otherwise it must be non-NULL. +- Examples: + +1) Must be non-NULL (used) +```python +@T.prim_func +def main(A: T.Tensor((M, K), dtype)): + A[0] = 1 +``` +Passing `None` raises: `main.A_handle is expected to have non-NULL pointer`. + +2) Still must be non-NULL (constant-true branch) +```python +some_cond: bool = True +@T.prim_func +def main(A: T.Tensor((M, K), dtype)): + if some_cond: + A[0] = 1 +``` + +3) Nullable (constant-false branch, statically unreachable) +```python +some_cond: bool = False +@T.prim_func +def main(A: T.Tensor((M, K), dtype)): + if some_cond: + A[0] = 1 +``` + +4) Must be non-NULL (runtime condition) +```python +@T.prim_func +def main(A: T.Tensor((M, K), dtype), some_cond: T.bool): + if some_cond: + A[0] = 1 +``` +Since `some_cond` is only known at runtime, static analysis cannot prove `A` is unused; `A` is thus non-nullable. 
+ +--- + +## Device Type Codes (DLPack) +Supported and referenced device codes in error messages: `1=CPU, 2=CUDA, 7=Vulkan, 8=Metal, 10=ROCM, 14=OneAPI, 15=WebGPU`. +Kernels assert that `device_type` matches the target backend, and require `device_id` consistency across tensors. + +--- + +## Common Error Examples (What you’ll see) +- Argument count mismatch (num_args) + - Trigger: missing/extra argument + - Error: `: num_args should be N; expected: , got: N` + +- Pointer-typed argument expected + - Trigger: scalar passed where a tensor is expected + - Error: `: Expect arg[i] to be pointer` + +- Rank (ndim) mismatch + - Trigger: runtime rank differs from compile-time rank + - Error: `..ndim is expected to equal R, but got mismatched ndim` + +- Dtype mismatch + - Trigger: dtype not equal to the compiled dtype and not within the tolerance set + - Error: `..dtype is expected to be , but got incompatible dtype` + +- Shape constraint violation + - Trigger: a dimension doesn’t match a constant/symbol binding + - Error: `Argument ..shape[i] has an unsatisfied constraint: ... == ` + +- Strides check failed (e.g., non-contiguous layout) + - Trigger: transposed/sliced tensors that violate expected strides + - Error: `Argument ..strides[j] has an unsatisfied constraint: ... == ` + +- Device type mismatch + - Trigger: calling a CUDA kernel with CPU tensors, etc. + - Error: `..device_type mismatch [expected: ()] ...` + +- Device id mismatch + - Trigger: mixing tensors from different GPUs + - Error: `Argument ..device_id has an unsatisfied constraint: ... == ...` + +- NULL data pointer + - Trigger: tensor required to be non-null has a NULL data pointer + - Error: `. 
is expected to have non-NULL data pointer, but got NULL`
+
+- Scalar type mismatch
+  - Trigger: passing float to `T.int32`, or non-boolean to `T.bool`
+  - Error: `: Expect arg[i] to be int/boolean`
+
+---
+
+## Troubleshooting Tips
+- Print the host source: `print(fn.get_host_source())` to see the exact assertion and expected vs. actual fields.
+- Fix strides: call `.contiguous()` for non-contiguous tensors, or avoid generating transposed/sliced layouts that break assumptions.
+- Align devices: ensure all participating tensors share the same `device_type` and `device_id`.
+- Align dtype: use `.to()` or construct tensors with the correct dtype; pay attention to `float8` and `bool` tolerance.
+- Dynamic shapes: ensure cross-tensor linear relations can be uniquely determined at the check point (only one unknown at a time).
+
+---
+
+## FAQ
+- Can I disable the checks?
+  - Not recommended and usually not supported. Checks are done on the host to preserve ABI stability and fail early close to the device call.
+- Is the overhead noticeable?
+  - No. The checks are lightweight (branches and field reads); the dominant cost is the Python→C boundary crossing itself, so these host-side checks are cheaper than performing the equivalent validation in Python.
+ +--- + +## Reference Example (Matmul + ReLU) + +```python +@T.prim_func +def matmul_relu_kernel( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), +): + # Initialize Kernel Context + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=0): + T.copy(A[by * block_M, ko * block_K], A_shared) + T.copy(B[ko * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local) + T.copy(C_local, C[by * block_M, bx * block_N]) + +# For debugging, print the host source +print(matmul_relu_kernel.get_host_source()) +``` + +The host will insert all checks described above for this example. + +--- + +## Quick Error Reference (Short List) +- Argument count + - Trigger: missing/extra args; Error: `num_args should be N; expected: , got: N`. +- Pointer kind + - Trigger: scalar passed to tensor arg; Error: `Expect arg[i] to be pointer`. +- Rank (ndim) + - Trigger: runtime rank != compile-time; Error: `ndim ... expected to equal R`. +- Dtype + - Trigger: mismatch and not tolerated; Error: `dtype ... expected to be `. +- Shape + - Trigger: constant/symbol binding violated; Error: `shape[i] ... == `. +- Strides + - Trigger: layout mismatch; Error: `strides[j] ... == `. +- Device type + - Trigger: wrong backend device; Error: `device_type mismatch [expected: ...]`. +- Device id + - Trigger: tensors on different GPUs; Error: `device_id ... == ...`. +- Data pointer + - Trigger: required non-NULL but NULL; Error: `non-NULL data pointer`. +- Scalar types + - Trigger: wrong scalar type; Error: `Expect arg[i] to be int/boolean`. 
+ +--- + +## Host Error Troubleshooting (Minimal Repros) + +Below are minimal repro snippets for common host-side errors, assuming a CUDA-targeted kernel like `matmul_relu_kernel` with: + +```python +# Convention: +# A: float16 [M, K] +# B: float16 [K, N] +# C: float16 [M, N] +# Target: CUDA (device_type=2) +fn = matmul_relu_kernel # your compiled function +M = N = K = 1024 +``` + +Adjust dtype/device if your kernel differs. + +### 0. Tip: print the host source +```python +print(fn.get_host_source()) +``` + +### 1. num_args mismatch +```python +import torch + +A = torch.empty((M, K), device='cuda', dtype=torch.float16) +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +# Missing C +fn(A, B) +``` +Expected: `: num_args should be 3; expected: , got: 3`. + +Fix: pass all arguments per the signature. + +### 2. Expect pointer (tensor) but got scalar +```python +import torch + +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(1, B, C) +``` +Expected: `: Expect arg[0] to be pointer`. + +Fix: pass a DLPack-compatible tensor (e.g., torch.Tensor). + +### 3. ndim mismatch +```python +import torch + +A = torch.empty((M, K, 1), device='cuda', dtype=torch.float16) # rank=3 +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `.A_handle.ndim is expected to equal 2, but got mismatched ndim`. + +Fix: ensure runtime rank equals compiled rank. + +### 4. dtype mismatch +```python +import torch + +A = torch.empty((M, K), device='cuda', dtype=torch.float32) # should be float16 +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `.A_handle.dtype is expected to be float16, but got incompatible dtype`. + +Fix: `A = A.to(torch.float16)` or create with the correct dtype. + +### 5. 
Shape constant/symbol mismatch +```python +import torch + +A = torch.empty((M, K + 1), device='cuda', dtype=torch.float16) # K mismatched +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `Argument .A_handle.shape[i] has an unsatisfied constraint: ... == `. + +Fix: satisfy linear constraints and constants across tensors. + +### 6. Strides check failure (non-contiguous) +```python +import torch + +A = torch.empty((M, K), device='cuda', dtype=torch.float16) +A_nc = A.t() # transpose -> non-contiguous +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A_nc, B, C) +``` +Expected: `Argument .A_handle.strides[1] has an unsatisfied constraint: ... == 1`. + +Fix: pass `A_nc.contiguous()` or align the layout expectation in the kernel. + +### 7. device_type mismatch +```python +import torch + +A = torch.empty((M, K), device='cpu', dtype=torch.float16) +B = torch.empty((K, N), device='cpu', dtype=torch.float16) +C = torch.empty((M, N), device='cpu', dtype=torch.float16) +fn(A, B, C) # CUDA-targeted kernel +``` +Expected: `.A_handle.device_type mismatch [expected: 2 (cuda)] ...`. + +Fix: move tensors to the CUDA device. + +### 8. device_id mismatch (multi-GPU) +```python +import torch + +A = torch.empty((M, K), device='cuda:0', dtype=torch.float16) +B = torch.empty((K, N), device='cuda:1', dtype=torch.float16) +C = torch.empty((M, N), device='cuda:0', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `Argument .B_handle.device_id has an unsatisfied constraint: ... == ...`. + +Fix: place all tensors on the same GPU (e.g., `cuda:0`). + +### 9. NULL data pointer (advanced) +This usually comes from hand-constructed DLTensor/NDArray, or external frameworks passing unallocated/freed storage. Regular `torch.Tensor` allocations rarely hit this. + +Expected: `. 
is expected to have non-NULL data pointer, but got NULL`. + +Fix: ensure valid underlying storage; in PyTorch scenarios, avoid constructing tensors from invalid external handles. + +### 10. Scalar type mismatch (int / bool) +```python +import tilelang.language as T + +@T.prim_func +def scalar_check(x: T.int32, flag: T.bool()): + T.evaluate(0) + +scalar_check(1.0, True) # x is float -> Expect arg[0] to be int +scalar_check(1, 2.5) # flag is float -> Expect arg[1] to be boolean +``` + +Fix: pass correct scalar types, e.g., `scalar_check(1, True)`. + +--- + +## Closing Notes +- Cross-check “shape / strides / device / dtype” against the kernel signature to localize issues efficiently. +- For complex symbolic relations, print the host source to confirm binding/solving order, then adjust runtime shapes/layouts accordingly. + diff --git a/docs/index.md b/docs/index.md index 5d9a158f..9f794776 100644 --- a/docs/index.md +++ b/docs/index.md @@ -42,6 +42,7 @@ deeplearning_operators/deepseek_mla compiler_internals/letstmt_inline compiler_internals/inject_fence_proxy +compiler_internals/tensor_checks ::: :::{toctree} diff --git a/examples/quickstart.py b/examples/quickstart.py index 46a39e0d..39ad348b 100644 --- a/examples/quickstart.py +++ b/examples/quickstart.py @@ -77,7 +77,7 @@ torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) print("Kernel output matches PyTorch reference.") # 4. Retrieve and inspect the generated CUDA source (optional) -# cuda_source = jit_kernel.get_kernel_source() +# cuda_source = matmul_relu_kernel.get_kernel_source() # print("Generated CUDA kernel:\n", cuda_source) # 5.Profile latency with kernel diff --git a/maint/host_checks/01_num_args_mismatch.py b/maint/host_checks/01_num_args_mismatch.py new file mode 100644 index 00000000..8ba36646 --- /dev/null +++ b/maint/host_checks/01_num_args_mismatch.py @@ -0,0 +1,21 @@ +"""Reproduce: Argument count mismatch. 
+ +Note: The adapter-level wrapper expects only inputs (A, B) because C is marked as output. +Calling with the wrong number of inputs raises a ValueError before host entry. +""" +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 256 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cuda", dtype=torch.float16) + # Missing b + # Expected: ValueError with message about expected vs. actual inputs + fn(a) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/02_pointer_type_error.py b/maint/host_checks/02_pointer_type_error.py new file mode 100644 index 00000000..fd358540 --- /dev/null +++ b/maint/host_checks/02_pointer_type_error.py @@ -0,0 +1,22 @@ +"""Reproduce: Pointer-type argument expected but scalar provided. + +We pass an integer for A; wrapper forwards it to the host where a pointer is expected. +Expected: error like "Expect buffer A_handle to be pointer or tensor" (exact name depends on kernel param). +""" +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 256 + fn = build_matmul_kernel(M, N, K, target="cuda") + + # Wrong type for A (int instead of tensor) + a = 1 + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/03_ndim_mismatch.py b/maint/host_checks/03_ndim_mismatch.py new file mode 100644 index 00000000..994ce23e --- /dev/null +++ b/maint/host_checks/03_ndim_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: ndim (rank) mismatch for A. 
+""" +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + + # A has rank 3 instead of 2 + a = torch.empty((M, K, 1), device="cuda", dtype=torch.float16) + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/04_dtype_mismatch.py b/maint/host_checks/04_dtype_mismatch.py new file mode 100644 index 00000000..6e6a0503 --- /dev/null +++ b/maint/host_checks/04_dtype_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: dtype mismatch for A (float32 vs expected float16). +""" +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + print(fn.get_host_source()) + + a = torch.empty((M, K), device="cuda", dtype=torch.float32) # should be float16 + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/05_shape_mismatch.py b/maint/host_checks/05_shape_mismatch.py new file mode 100644 index 00000000..8b41ae36 --- /dev/null +++ b/maint/host_checks/05_shape_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: shape constant/symbol mismatch on A. +""" +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + + # A's second dimension is wrong (K+1 instead of K) + a = torch.empty((M, K + 1), device="cuda", dtype=torch.float16) + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/06_strides_mismatch.py b/maint/host_checks/06_strides_mismatch.py new file mode 100644 index 00000000..477d200b --- /dev/null +++ b/maint/host_checks/06_strides_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: strides check failure (non-contiguous A via transpose). 
+""" +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cuda", dtype=torch.float16) + a_nc = a.t() # non-contiguous after transpose + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a_nc, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/07_device_type_mismatch.py b/maint/host_checks/07_device_type_mismatch.py new file mode 100644 index 00000000..67cb7718 --- /dev/null +++ b/maint/host_checks/07_device_type_mismatch.py @@ -0,0 +1,18 @@ +"""Reproduce: device_type mismatch by passing CPU tensors to a CUDA kernel. +""" +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 64 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cpu", dtype=torch.float16) + b = torch.empty((K, N), device="cpu", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/08_device_id_mismatch.py b/maint/host_checks/08_device_id_mismatch.py new file mode 100644 index 00000000..64910966 --- /dev/null +++ b/maint/host_checks/08_device_id_mismatch.py @@ -0,0 +1,25 @@ +"""Reproduce: device_id mismatch (requires >=2 CUDA devices). 
+""" +import torch +from common import build_matmul_kernel + + +def main(): + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available") + if torch.cuda.device_count() < 2: + print("[SKIP] Need at least 2 CUDA devices to reproduce device_id mismatch.") + return + + M = N = K = 64 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cuda:0", dtype=torch.float16) + b = torch.empty((K, N), device="cuda:1", dtype=torch.float16) + # Output device is derived by the adapter; mismatch occurs in host checks + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/09_null_data_pointer.py b/maint/host_checks/09_null_data_pointer.py new file mode 100644 index 00000000..00bac67d --- /dev/null +++ b/maint/host_checks/09_null_data_pointer.py @@ -0,0 +1,25 @@ +"""Reproduce: NULL data pointer (advanced). + +Passing None for a tensor argument will be forwarded through the adapter. Depending on +FFI handling, this commonly triggers a pointer-type assertion (e.g., "Expect buffer to be pointer or tensor") +or a host-side non-NULL pointer check. + +Note: Constructing a true DLTensor with NULL data in PyTorch is not typical; this script +demonstrates passing None, which still reproduces the intended class of failure. +""" +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 64 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = None # attempt to pass a null-like pointer + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/10_scalar_type_mismatch.py b/maint/host_checks/10_scalar_type_mismatch.py new file mode 100644 index 00000000..f1fcba27 --- /dev/null +++ b/maint/host_checks/10_scalar_type_mismatch.py @@ -0,0 +1,15 @@ +"""Reproduce: scalar parameter type mismatch (int/bool). 
+""" +from common import build_scalar_check_kernel + + +def main(): + fn = build_scalar_check_kernel(target="cuda") + + # Wrong types + fn(1.0, True) # x should be int -> Expect arg[0] to be int + fn(1, 2.5) # flag should be bool -> Expect arg[1] to be boolean + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/README.md b/maint/host_checks/README.md new file mode 100644 index 00000000..ac23d6fd --- /dev/null +++ b/maint/host_checks/README.md @@ -0,0 +1,21 @@ +# Host-Side Check Repro Scripts + +This folder contains standalone scripts that deliberately trigger host-side (and adapter-side) validation errors described in `docs/compiler_internals/tensor_checks.md`. Each script can be run directly and will reproduce the corresponding error with a minimal example. + +Prerequisites +- CUDA-capable environment (most scripts compile a CUDA-targeted kernel) +- Python packages: torch, tilelang + +Usage +- Run any script, e.g.: + - `python 01_num_args_mismatch.py` + - `python 02_pointer_type_error.py` + - ... up to `10_scalar_type_mismatch.py` + +- Or run all at once with a summary: + - `python run_all.py` + - Logs per test are saved under `logs/` as `