Unverified Commit 37051417 authored by Wenhao Xie, committed by GitHub

[Example] Add vertical slash sparse attention pattern (#762)

* upd sparse attn

* lint

* rename

* update test file

* update benchmark

* lint

* update benchmark
parent 1774a1aa
# Performance Benchmark
## Hardware & Environment
- **Hardware**: NVIDIA H100 PCIe
- **CUDA version**: 12.8.1
- **PyTorch Version**: 2.7.1+cu128
- **Triton Version**: 3.3.1
## Performance Results
BATCH_SIZE=1, HEAD=1, DIM=64
| SEQ_LEN | VS_LIST [vertical, slash] | Triton Time | TileLang Time | Speedup |
|---------|--------------|-------------|---------------|---------|
| 8192 | [1000, 200] | 0.168 ms | 0.105 ms | 1.60x |
| 8192 | [1000, 600] | 0.207 ms | 0.119 ms | 1.74x |
| 8192 | [800, 600] | 0.207 ms | 0.122 ms | 1.70x |
| | | | | |
| 16384 | [1000, 200] | 0.261 ms | 0.167 ms | 1.56x |
| 16384 | [1000, 600] | 0.419 ms | 0.258 ms | 1.62x |
| 16384 | [800, 600] | 0.422 ms | 0.255 ms | 1.65x |
| | | | | |
| 32768 | [1000, 200] | 0.374 ms | 0.248 ms | 1.51x |
| 32768 | [1000, 600] | 0.823 ms | 0.554 ms | 1.49x |
| 32768 | [800, 600] | 0.826 ms | 0.558 ms | 1.48x |
| | | | | |
| 65536 | [1000, 200] | 0.637 ms | 0.524 ms | 1.22x |
| 65536 | [1000, 600] | 1.758 ms | 1.501 ms | 1.17x |
| 65536 | [800, 600] | 1.783 ms | 1.489 ms | 1.20x |
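
Each row can be reproduced by running the example with the matching sequence length and `[vertical_size, slash_size]` pair. A minimal sketch, assuming the example module is named after the accompanying test import:

```python
# Reproduce one benchmark row, e.g. SEQ_LEN=8192 with VS_LIST=[1000, 200].
from example_vertical_slash_sparse_attn import main

main(["--seq_len", "8192", "--vertical_size", "1000", "--slash_size", "200"])
```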
# Copyright (c) 2024-2025 Microsoft
# Licensed under The MIT License [see LICENSE for details]
import math
import argparse
import torch
import triton
import triton.language as tl
import tilelang
import tilelang.language as T
from tilelang.profiler import do_bench
from tilelang.testing import torch_assert_close
tilelang.disable_cache()
@tilelang.jit(out_idx=[3])
def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_size):
block_M = 64
block_N = 64
num_stages = 2
threads = 128
scale = (1.0 / dim)**0.5 * 1.44269504
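    # Softmax scale folded with log2(e) so the kernels can use exp2 instead of exp.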
shape = [batch, heads, seq_len, dim]
count_shape = [batch, heads, (seq_len + block_M - 1) // block_M]
offset_shape = count_shape + [slash_size]
index_shape = count_shape + [vertical_size]
vertical_size_round, slash_size_round = tilelang.next_power_of_2(
vertical_size), tilelang.next_power_of_2(slash_size)
dtype = "float16"
accum_dtype = "float"
int_dtype = "int32"
def kernel_func(block_M, block_N, num_stages, threads):
@T.macro
def Prefetch(
K: T.Tensor(shape, dtype),
V: T.Tensor(shape, dtype),
K_shared: T.SharedBuffer([block_N, dim], dtype),
V_shared: T.SharedBuffer([block_N, dim], dtype),
column_index: T.SharedBuffer([vertical_size], int_dtype),
column_count: T.int32,
k: T.int32,
bz: T.int32,
by: T.int32,
):
with T.attr("default", "async_scope", 1):
for i, j in T.Parallel(block_N, dim):
K_shared[i, j] = T.if_then_else(k + i < column_count,
K[bz, by, column_index[k + i], j], 0)
with T.attr("default", "async_scope", 1):
for i, j in T.Parallel(block_N, dim):
V_shared[i, j] = T.if_then_else(k + i < column_count,
V[bz, by, column_index[k + i], j], 0)
T.ptx_commit_group()
@T.macro
def Compute(
acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype),
acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype),
acc_o: T.FragmentBuffer([block_M, dim], accum_dtype),
scores_max: T.FragmentBuffer([block_M], accum_dtype),
scores_max_prev: T.FragmentBuffer([block_M], accum_dtype),
k: T.int32,
column_count: T.int32,
Q_shared: T.SharedBuffer([block_M, dim], dtype),
K_shared: T.SharedBuffer([block_N, dim], dtype),
V_shared: T.SharedBuffer([block_N, dim], dtype),
scores_scale: T.FragmentBuffer([block_M], accum_dtype),
scores_sum: T.FragmentBuffer([block_M], accum_dtype),
logsum: T.FragmentBuffer([block_M], accum_dtype),
):
T.ptx_wait_group(1)
for i, j in T.Parallel(block_M, block_N):
acc_s[i, j] = T.if_then_else(k + j < column_count, 0, -T.infinity(acc_s.dtype))
T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow)
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
for i in T.Parallel(block_M):
scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, block_N):
acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, dim):
acc_o[i, j] = acc_o[i, j] * scores_scale[i]
T.copy(acc_s, acc_s_cast)
T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow)
T.reduce_sum(acc_s, scores_sum, dim=1)
for i in T.Parallel(block_M):
logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i]
@T.prim_func
def vs_sparse_flashattn(
Q: T.Tensor(shape, dtype),
K: T.Tensor(shape, dtype),
V: T.Tensor(shape, dtype),
Output: T.Tensor(shape, dtype),
BlockCount: T.Tensor(count_shape, int_dtype),
BlockOffset: T.Tensor(offset_shape, int_dtype),
ColumnCount: T.Tensor(count_shape, int_dtype),
ColumnIndex: T.Tensor(index_shape, int_dtype),
):
with T.Kernel(
T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bc, by, bz):
bx = T.ceildiv(seq_len, block_M) - 1 - bc
Q_shared = T.alloc_shared([block_M, dim], dtype)
K_shared = T.alloc_shared([block_N, dim], dtype)
V_shared = T.alloc_shared([block_N, dim], dtype)
O_shared = T.alloc_shared([block_M, dim], dtype)
acc_s = T.alloc_fragment([block_M, block_N], accum_dtype)
acc_s_cast = T.alloc_fragment([block_M, block_N], dtype)
acc_o = T.alloc_fragment([block_M, dim], accum_dtype)
scores_max = T.alloc_fragment([block_M], accum_dtype)
scores_max_prev = T.alloc_fragment([block_M], accum_dtype)
scores_scale = T.alloc_fragment([block_M], accum_dtype)
scores_sum = T.alloc_fragment([block_M], accum_dtype)
logsum = T.alloc_fragment([block_M], accum_dtype)
block_count = T.alloc_local([1], int_dtype)
block_offset = T.alloc_shared([slash_size_round], int_dtype, scope="shared")
column_count = T.alloc_local([1], int_dtype)
column_index = T.alloc_shared([vertical_size_round], int_dtype, scope="shared")
K_shared_1 = T.alloc_shared([block_N, dim], dtype)
V_shared_1 = T.alloc_shared([block_N, dim], dtype)
K_shared_2 = T.alloc_shared([block_N, dim], dtype)
V_shared_2 = T.alloc_shared([block_N, dim], dtype)
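                # Double-buffered K/V tiles for the vertical-column pass: Prefetch fills one
                # pair with asynchronous copies while Compute consumes the other.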
block_count[0] = BlockCount[bz, by, bx]
column_count[0] = ColumnCount[bz, by, bx]
for vi in T.Parallel(slash_size_round):
if vi < slash_size:
block_offset[vi] = BlockOffset[bz, by, bx, vi]
for vi in T.Parallel(vertical_size_round):
if vi < vertical_size:
column_index[vi] = ColumnIndex[bz, by, bx, vi]
T.fill(acc_o, 0)
T.fill(logsum, 0)
T.fill(scores_max, -T.infinity(accum_dtype))
T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared)
for bi in T.Pipelined(block_count[0], num_stages=num_stages):
k = block_offset[bi]
T.copy(K[bz, by, k:k + block_N, :], K_shared)
for i, j in T.Parallel(block_M, block_N):
acc_s[i, j] = T.if_then_else(bx * block_M + i >= k + j, 0,
-T.infinity(acc_s.dtype))
T.gemm(
Q_shared,
K_shared,
acc_s,
transpose_B=True,
policy=T.GemmWarpPolicy.FullRow)
T.copy(scores_max, scores_max_prev)
T.reduce_max(acc_s, scores_max, dim=1, clear=False)
for i in T.Parallel(block_M):
scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, block_N):
acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale)
for i, j in T.Parallel(block_M, dim):
acc_o[i, j] = acc_o[i, j] * scores_scale[i]
T.copy(acc_s, acc_s_cast)
T.copy(V[bz, by, k:k + block_N, :], V_shared)
T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow)
T.reduce_sum(acc_s, scores_sum, dim=1)
for i in T.Parallel(block_M):
logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i]
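                # The pipelined loop above covers the slash (causal block) part; the pass below
                # gathers the vertical key columns selected in column_index.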
if column_count[0] != 0:
Prefetch(K, V, K_shared_1, V_shared_1, column_index, column_count[0], 0, bz, by)
for bi in T.serial(T.ceildiv(column_count[0], block_N) - 1):
k = bi * block_N
if bi % 2 == 0:
Prefetch(K, V, K_shared_2, V_shared_2, column_index, column_count[0],
k + block_N, bz, by)
Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, k,
column_count[0], Q_shared, K_shared_1, V_shared_1, scores_scale,
scores_sum, logsum)
else:
Prefetch(K, V, K_shared_1, V_shared_1, column_index, column_count[0],
k + block_N, bz, by)
Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, k,
column_count[0], Q_shared, K_shared_2, V_shared_2, scores_scale,
scores_sum, logsum)
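                    # Tail: the last column block was prefetched by the final loop iteration;
                    # choose the shared buffer by the parity of the number of column blocks.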
if T.ceildiv(column_count[0], block_N) % 2 == 0:
Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev,
T.ceildiv(column_count[0], block_N) * block_N - block_N,
column_count[0], Q_shared, K_shared_2, V_shared_2, scores_scale,
scores_sum, logsum)
else:
Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev,
T.ceildiv(column_count[0], block_N) * block_N - block_N,
column_count[0], Q_shared, K_shared_1, V_shared_1, scores_scale,
scores_sum, logsum)
for i, j in T.Parallel(block_M, dim):
acc_o[i, j] /= logsum[i]
T.copy(acc_o, O_shared)
T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :])
return vs_sparse_flashattn
return kernel_func(block_M, block_N, num_stages, threads)
@triton.jit
def _triton_mixed_sparse_attn_fwd_kernel(
Q,
K,
V,
seqlens,
sm_scale,
block_count,
block_offset,
column_count,
column_index,
Out,
stride_qz,
stride_qh,
stride_qm,
stride_qk,
stride_kz,
stride_kh,
stride_kn,
stride_kk,
stride_vz,
stride_vh,
stride_vn,
stride_vk,
stride_oz,
stride_oh,
stride_om,
stride_ok,
Z,
H,
N_CTX,
NUM_ROWS,
NNZ_S,
NNZ_V,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
BLOCK_DMODEL: tl.constexpr,
dtype: tl.constexpr,
):
start_m = tl.program_id(0) # bx
off_hz = tl.program_id(1) # by
seqlen = tl.load(seqlens + off_hz // H)
if start_m * BLOCK_M >= seqlen:
return
# initialize offsets
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_d = tl.arange(0, BLOCK_DMODEL)
qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh
q_ptrs = Q + qo_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
k_ptrs = K + kv_offset + offs_d[:, None] * stride_kk
v_ptrs = V + kv_offset + offs_d[None, :] * stride_vk
o_ptrs = Out + qo_offset + offs_m[:, None] * stride_om + offs_d[None, :] * stride_ok
num_blks = tl.load(block_count + off_hz * NUM_ROWS + start_m)
blks_ptr = block_offset + (off_hz * NUM_ROWS + start_m) * NNZ_S
num_cols = tl.load(column_count + off_hz * NUM_ROWS + start_m)
cols_ptr = column_index + (off_hz * NUM_ROWS + start_m) * NNZ_V
# initialize pointer to m and l
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
# scale sm_scale by log_2(e) and use
# 2^x instead of exp in the loop because CSE and LICM
# don't work as expected with `exp` in the loop
qk_scale = sm_scale * 1.44269504
# load q: it will stay in SRAM throughout
q = tl.load(q_ptrs)
q = (q * qk_scale).to(dtype)
# loop over k, v and update accumulator
m_mask = offs_m[:, None] < seqlen
for block_index in range(num_blks):
start_n = tl.load(blks_ptr + block_index)
cols = start_n + offs_n
n_mask = cols < seqlen
# -- load k, v --
k = tl.load(k_ptrs + cols[None, :] * stride_kn, mask=n_mask[None, :], other=0.0)
v = tl.load(v_ptrs + cols[:, None] * stride_vn, mask=n_mask[:, None], other=0.0)
# -- compute qk --
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
causal_mask = cols[None, :] <= offs_m[:, None]
qk = tl.where(m_mask & causal_mask, qk, float("-inf"))
qk += tl.dot(q, k)
# -- compute scaling constant --
m_i_new = tl.maximum(m_i, tl.max(qk, 1))
alpha = tl.math.exp2(m_i - m_i_new)
p = tl.math.exp2(qk - m_i_new[:, None])
# -- scale and update acc --
        acc_scale = l_i * 0 + alpha  # work around a compiler bug
acc *= acc_scale[:, None]
acc += tl.dot(p.to(dtype), v)
# -- update m_i and l_i --
l_i = l_i * alpha + tl.sum(p, 1)
m_i = m_i_new
    for start_n in range(0, num_cols, BLOCK_N):
        # keys for columns [start_n, start_n + BLOCK_N) of the gathered vertical index list
n_mask = start_n + offs_n < num_cols
cols = tl.load(cols_ptr + start_n + offs_n, mask=n_mask, other=0)
# -- load k, v --
k = tl.load(k_ptrs + cols[None, :] * stride_kn, mask=n_mask[None, :], other=0.0)
v = tl.load(v_ptrs + cols[:, None] * stride_vn, mask=n_mask[:, None], other=0.0)
# -- compute qk --
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
qk = tl.where(m_mask & n_mask, qk, float("-inf"))
qk += tl.dot(q, k)
# -- compute scaling constant --
m_i_new = tl.maximum(m_i, tl.max(qk, 1))
alpha = tl.math.exp2(m_i - m_i_new)
p = tl.math.exp2(qk - m_i_new[:, None])
# -- scale and update acc --
        acc_scale = l_i * 0 + alpha  # work around a compiler bug
acc *= acc_scale[:, None]
acc += tl.dot(p.to(dtype), v)
# -- update m_i and l_i --
l_i = l_i * alpha + tl.sum(p, 1)
m_i = m_i_new
# write back O
acc /= l_i[:, None]
# acc = tl.where(m_mask, acc / l_i[:, None], 0.0)
tl.store(o_ptrs, acc.to(dtype), mask=m_mask)
def _triton_mixed_sparse_attention(
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
seqlens: torch.Tensor,
block_count: torch.Tensor,
block_offset: torch.Tensor,
column_count: torch.Tensor,
column_index: torch.Tensor,
sm_scale: float,
block_size_M: int = 64,
block_size_N: int = 64,
) -> torch.Tensor:
# shape constraints
Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
assert Lq == Lk and Lk == Lv
assert Lk in {16, 32, 64, 128}
o = torch.zeros_like(q)
grid = (triton.cdiv(q.shape[2], block_size_M), q.shape[0] * q.shape[1], 1)
dtype = tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16
_triton_mixed_sparse_attn_fwd_kernel[grid](
q,
k,
v,
seqlens,
sm_scale,
block_count,
block_offset,
column_count,
column_index,
o,
q.stride(0),
q.stride(1),
q.stride(2),
q.stride(3),
k.stride(0),
k.stride(1),
k.stride(2),
k.stride(3),
v.stride(0),
v.stride(1),
v.stride(2),
v.stride(3),
o.stride(0),
o.stride(1),
o.stride(2),
o.stride(3),
q.shape[0],
q.shape[1],
q.shape[2],
block_count.shape[-1],
block_offset.shape[-1],
column_index.shape[-1],
BLOCK_M=block_size_M,
BLOCK_N=block_size_N,
BLOCK_DMODEL=Lk,
dtype=dtype,
num_warps=4,
num_stages=2,
)
return o
def vertical_slash_sparse_attention(
query: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
key: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
value: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
v_idx: torch.Tensor, # [BATCH, N_HEADS, NNZ_V]
s_idx: torch.Tensor, # [BATCH, N_HEADS, NNZ_S]
block_size_M: int = 64,
block_size_N: int = 64,
):
from torch.utils.cpp_extension import load
import os
current_dir = os.path.dirname(os.path.abspath(__file__))
sources = [
os.path.join(current_dir, 'ops', 'kernels.cpp'),
os.path.join(current_dir, 'ops', 'vertical_slash_index.cu')
]
ops = load(name='convert', sources=sources, verbose=False)
convert_vertical_slash_indexes = ops.convert_vertical_slash_indexes
batch_size, num_heads, context_size, head_dim = query.shape
pad = (block_size_M - context_size) & (block_size_M - 1)
if pad == block_size_M:
pad = 0
query = torch.nn.functional.pad(query, [0, 0, 0, pad, 0, 0, 0, 0])
key = torch.nn.functional.pad(key, [0, 0, 0, pad, 0, 0, 0, 0])
value = torch.nn.functional.pad(value, [0, 0, 0, pad, 0, 0, 0, 0])
if head_dim not in [16, 32, 64, 128, 256, 512]:
target_dim = 2**math.ceil(math.log2(head_dim)) - head_dim
query = torch.nn.functional.pad(query, [0, target_dim, 0, 0, 0, 0, 0, 0])
key = torch.nn.functional.pad(key, [0, target_dim, 0, 0, 0, 0, 0, 0])
value = torch.nn.functional.pad(value, [0, target_dim, 0, 0, 0, 0, 0, 0])
v_idx = v_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(
dim=-1, descending=False)[0]
s_idx = s_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(
dim=-1, descending=True)[0]
seqlens = torch.tensor([context_size], dtype=torch.int32, device=query.device)
sm_scale = head_dim**-0.5
block_count, block_offset, column_count, column_index = convert_vertical_slash_indexes(
seqlens,
v_idx,
s_idx,
context_size,
block_size_M,
block_size_N,
)
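    # Per 64-row query block: block_count/block_offset list the causal slash key blocks,
    # while column_count/column_index list the individual vertical key columns.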
tl_kernel = _tl_vs_sparse_flashattn(batch_size, num_heads, context_size, head_dim,
v_idx.shape[2], s_idx.shape[2])
def run(is_triton: bool = True):
if is_triton:
out = _triton_mixed_sparse_attention(
query,
key,
value,
seqlens,
block_count,
block_offset,
column_count,
column_index,
sm_scale,
block_size_M,
block_size_N,
)
else:
out = tl_kernel(query, key, value, block_count, block_offset, column_count,
column_index)
return out[..., :context_size, :head_dim]
return run
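# Hedged reference helper (not part of the benchmarked path; the name is an assumption):
# rebuilds the dense boolean attention mask implied by the index-conversion outputs, which
# can be handy when sanity-checking the sparse pattern on small inputs.
def _dense_mask_from_indexes(block_count, block_offset, column_count, column_index,
                             seq_len, block_size_M=64, block_size_N=64):
    batch, heads, num_rows = block_count.shape
    device = block_count.device
    slash_mask = torch.zeros(batch, heads, seq_len, seq_len, dtype=torch.bool, device=device)
    column_mask = torch.zeros_like(slash_mask)
    for b in range(batch):
        for h in range(heads):
            for r in range(num_rows):
                rows = slice(r * block_size_M, min((r + 1) * block_size_M, seq_len))
                # Slash part: contiguous key blocks listed in block_offset.
                for i in range(int(block_count[b, h, r])):
                    off = int(block_offset[b, h, r, i])
                    slash_mask[b, h, rows, off:min(off + block_size_N, seq_len)] = True
                # Vertical part: individual key columns listed in column_index.
                for i in range(int(column_count[b, h, r])):
                    column_mask[b, h, rows, int(column_index[b, h, r, i])] = True
    # Both kernels apply the causal constraint to the slash blocks only; the vertical
    # columns are already restricted to keys before the end of each query block.
    causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=device))
    return (slash_mask & causal) | column_mask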
def sum_all_diagonal_matrix(mat: torch.Tensor):
b, h, n, m = mat.shape
zero_mat = torch.zeros((b, h, n, n)).to(mat.device) # Zero matrix used for padding
mat_padded = torch.cat((zero_mat, mat, zero_mat), -1) # pads the matrix on left and right
mat_strided = mat_padded.as_strided(
(1, 1, n, n + m), (1, n * (2 * n + m), 2 * n + m + 1, 1)) # Change the strides
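    # NOTE: the hard-coded (1, 1, ...) strided view assumes batch == heads == 1, which
    # matches the example's defaults.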
sum_diags = torch.sum(mat_strided, 2) # Sums the resulting matrix's columns
return sum_diags[:, :, 1:]
def main(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument("--batch", type=int, default=1)
parser.add_argument("--heads", type=int, default=1)
parser.add_argument("--seq_len", type=int, default=16384)
parser.add_argument("--head_dim", type=int, default=64)
parser.add_argument("--vertical_size", type=int, default=1000)
parser.add_argument("--slash_size", type=int, default=200)
args = parser.parse_args(argv)
# vs_list = [[1000, 200], [1000, 600], [800, 600]]
BATCH, N_HEADS, SEQ_LEN, D_HEAD = args.batch, args.heads, args.seq_len, args.head_dim
vertical_size, slash_size = args.vertical_size, args.slash_size
torch.manual_seed(0)
q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16)
q_len = SEQ_LEN
vertical_size, slash_size = min(q_len, vertical_size), min(q_len, slash_size)
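    # Estimate the sparse pattern from the attention of the last `last_q` queries: per-column
    # mass selects the vertical indices and per-diagonal mass selects the slash indices
    # (the first 30 columns and the last 30 diagonals are always forced into the selection).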
last_q = 64
qk = torch.einsum('bhmk, bhnk -> bhmn', q[:, :, -last_q:, :], k)
arange = torch.arange(last_q, device="cuda")
qk[:, :, :, -last_q:] = torch.where(arange[None, None, :, None] >= arange[None, None, None, :],
qk[:, :, :, -last_q:], -torch.inf)
qk = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32)
vertical = qk.sum(-2, keepdim=True)
vertical[..., :30] = torch.inf
vertical_topk = torch.topk(vertical, vertical_size, -1).indices
slash = sum_all_diagonal_matrix(qk)[..., :-last_q + 1]
slash[..., -30:] = torch.inf
slash = (q_len - 1) - torch.topk(slash, slash_size, -1).indices
_attn = vertical_slash_sparse_attention(q, k, v, vertical_topk, slash)
triton_out = _attn(True)
tilelang_out = _attn(False)
torch_assert_close(triton_out, tilelang_out, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.0)
print("Pass topk sparse attention test with qlen == klen")
triton_time = do_bench(lambda: _attn(True))
tilelang_time = do_bench(lambda: _attn(False))
print(f"triton_time: {triton_time:.3f}ms")
print(f"tilelang_time: {tilelang_time:.3f}ms")
print(f"speedup: {triton_time / tilelang_time:.2f}x")
if __name__ == "__main__":
main()
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "torch/extension.h"
#include <vector>
std::vector<at::Tensor> convert_vertical_slash_indexes(
torch::Tensor seqlens, // [BATCH, ]
torch::Tensor vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
torch::Tensor slash_indexes, // [BATCH, N_HEADS, NNZ_S]
int context_size, int block_size_M, int block_size_N);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("convert_vertical_slash_indexes", &convert_vertical_slash_indexes,
"dynamic sparse index function");
}
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include <assert.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include <torch/extension.h>
#include <cuda.h>
__device__ void save_blocks(int* block_offset, int range_start, int range_end, int block_size, int& block_count) {
for (int idx = range_start; idx < range_end; idx += block_size) {
block_offset[block_count++] = idx;
}
}
__global__ void convert_vertical_slash_indexes_kernel(
const int* seqlens, // [BATCH, ]
const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
int N_HEADS,
int N_ROWS,
int BLOCK_SIZE_M,
int BLOCK_SIZE_N,
int NNZ_V,
int NNZ_S
) {
const int batch_idx = blockIdx.y;
const int head_idx = blockIdx.x;
const int group_idx = blockIdx.z;
int seqlen = seqlens[batch_idx];
int block_idx_m = group_idx * blockDim.x + threadIdx.x;
int start_m = block_idx_m * BLOCK_SIZE_M;
if (start_m >= seqlen) {
return;
}
int end_m = start_m + BLOCK_SIZE_M;
vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V;
slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S;
int row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m;
block_count += row_offset;
block_offset += row_offset * NNZ_S;
column_count += row_offset;
column_index += row_offset * NNZ_V;
int tmp_col_cnt = 0, tmp_blk_cnt = 0;
int s = 0, v = 0;
int v_idx = vertical_indexes[v++];
int s_idx = slash_indexes[s++];
while (s_idx >= end_m) {
s_idx = slash_indexes[s++];
}
s_idx = max(end_m - s_idx, BLOCK_SIZE_M);
int range_start = s_idx - BLOCK_SIZE_M, range_end = s_idx;
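  // Merge walk over slashes (sorted descending) and verticals (sorted ascending): each slash
  // maps to a BLOCK_SIZE_M-wide key range ending at max(end_m - slash, BLOCK_SIZE_M); overlapping
  // or nearby ranges are merged and flushed as block offsets, and vertical indexes not covered
  // by any range are emitted as explicit columns.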
while (1) {
if (v_idx < range_end) {
if (v_idx < range_start) {
column_index[tmp_col_cnt++] = v_idx;
}
if (v < NNZ_V) {
v_idx = vertical_indexes[v++];
} else {
v_idx = end_m + BLOCK_SIZE_M;
}
} else {
if (s < NNZ_S) {
s_idx = max(end_m - slash_indexes[s++], BLOCK_SIZE_M);
} else {
save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt);
break;
}
if (s_idx > range_end + BLOCK_SIZE_M) {
save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt);
range_start = s_idx - BLOCK_SIZE_M;
range_end = s_idx;
} else if (s_idx > range_end) {
range_end += BLOCK_SIZE_M;
}
}
}
block_count[0] = tmp_blk_cnt;
column_count[0] = tmp_col_cnt;
}
void convert_vertical_slash_indexes_64x64(
const int* seqlens, // [BATCH, ]
const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
int BATCH_SIZE,
int N_HEADS,
int N_ROWS,
int NNZ_V,
int NNZ_S
) {
const int BLOCK_SIZE_M = 64;
const int BLOCK_SIZE_N = 64;
const int N_THREADS = 64;
const dim3 dimBlock(N_THREADS);
const dim3 dimGrid(N_HEADS, BATCH_SIZE, (N_ROWS + N_THREADS - 1) / N_THREADS);
convert_vertical_slash_indexes_kernel<<<dimGrid, dimBlock>>>(
seqlens, vertical_indexes, slash_indexes,
block_count, block_offset, column_count, column_index,
N_HEADS, N_ROWS, BLOCK_SIZE_M, BLOCK_SIZE_N, NNZ_V, NNZ_S
);
}
std::vector<at::Tensor> convert_vertical_slash_indexes(
torch::Tensor seqlens, // [BATCH, ]
torch::Tensor vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
torch::Tensor slash_indexes, // [BATCH, N_HEADS, NNZ_S]
int context_size,
int block_size_M,
int block_size_N
) {
assert(block_size_M == 64);
assert(block_size_N == 64);
cudaSetDevice(seqlens.get_device());
int batch_size = slash_indexes.size(0);
int num_heads = slash_indexes.size(1);
int nnz_slash = slash_indexes.size(2);
int nnz_vertical = vertical_indexes.size(2);
int num_rows = (context_size + block_size_M - 1) / block_size_M;
torch::Tensor block_count = torch::zeros({batch_size, num_heads, num_rows}, seqlens.options());
torch::Tensor block_offset = torch::zeros({batch_size, num_heads, num_rows, nnz_slash}, seqlens.options());
torch::Tensor column_count = torch::zeros({batch_size, num_heads, num_rows}, seqlens.options());
torch::Tensor column_index = torch::zeros({batch_size, num_heads, num_rows, nnz_vertical}, seqlens.options());
convert_vertical_slash_indexes_64x64(
seqlens.data_ptr<int>(),
vertical_indexes.data_ptr<int>(),
slash_indexes.data_ptr<int>(),
block_count.data_ptr<int>(),
block_offset.data_ptr<int>(),
column_count.data_ptr<int>(),
column_index.data_ptr<int>(),
batch_size,
num_heads,
num_rows,
nnz_vertical,
nnz_slash
);
return { block_count, block_offset, column_count, column_index };
}
import tilelang.testing
import example_vertical_slash_sparse_attn
@tilelang.testing.requires_cuda
def test_vs_sparse_attn():
example_vertical_slash_sparse_attn.main()
if __name__ == "__main__":
tilelang.testing.main()