Commit 4f83cf8f authored by Junxian

[release] v0.0.1
# Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/benchmarks/benchmark_flash_attention.py
import torch
from block_sparse_attn import (
block_sparse_attn_func,
flash_attn_varlen_func,
)
from utils import (
time_fwd,
flops,
efficiency,
write_to_excel,
)
def generate_base_sparsity_mask(max_seqlen_q, max_seqlen_k, round_base, m_block_dim, n_block_dim, sparsity, causal=False, device="cuda"):
def round_to_multiple(x, base):
return ((x + base - 1) // base) * base
nrow, ncol = round_to_multiple(max_seqlen_q, round_base) // m_block_dim, round_to_multiple(max_seqlen_k, round_base) // n_block_dim
base_mask = torch.zeros(1, nrow, ncol, device=device, dtype=torch.bool)
total_block_num = 0
density = 1.0 - sparsity
if not density == 0.0 and not density == 1.0:
for i in range(nrow): # do in reverse order
idx = nrow - i - 1
if causal:
available_col_num = max(0, ncol - i)
total_block_num += available_col_num
num_one = max(1, int(density * available_col_num))
base_mask[0][idx, torch.randperm(available_col_num)[:num_one]] = True
else:
available_col_num = ncol
total_block_num += available_col_num
num_one = max(1, int(density * available_col_num))
base_mask[0][idx, torch.randperm(available_col_num)[:num_one]] = True
elif density == 1.0:
base_mask[0] = torch.ones_like(base_mask[0])
total_block_num = nrow * ncol
else:
total_block_num = nrow * ncol
calculated_block_num = base_mask.sum().item()
real_sparsity = 1.0 - calculated_block_num / total_block_num
return base_mask, real_sparsity
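# Illustrative sketch (not part of the benchmark): with max_seqlen_q = max_seqlen_k = 256,
# round_base = m_block_dim = n_block_dim = 128, sparsity = 0.5 and causal = True, the block grid
# is nrow = ncol = 2. The bottom row may use 2 blocks and keeps max(1, int(0.5 * 2)) = 1; the top
# row may use 1 block and keeps 1. So total_block_num = 3, calculated_block_num = 2, and
# real_sparsity = 1 - 2/3 ≈ 0.33, which is why the returned sparsity can differ from the requested one.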
block_size = 128
def get_sparsity_list(sampling_steps, seqlen, causal):
blockmask_element_num = (seqlen // block_size) ** 2 // (2 if causal else 1)
stride = max(blockmask_element_num // sampling_steps, 1)
actual_steps = (blockmask_element_num + stride - 1) // stride
sparsity_list = []
for i in range(actual_steps):
sparse_rate = (1 + i * stride) / blockmask_element_num
if sparse_rate > 0.95 or sparse_rate < 0.0:
continue
sparsity_list.append(sparse_rate)
return sparsity_list
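# Example of the sampling above (illustration only): with sampling_steps = 20, seqlen = 1024 and
# causal = True, blockmask_element_num = (1024 // 128) ** 2 // 2 = 32 and stride = 1, so the
# requested sparsities are 1/32, 2/32, ..., 30/32; rates above 0.95 (31/32 and 32/32) are skipped.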
def profile_blocksparse_fwd():
repeats = 15
block_sparse_repeats = 3
device = 'cuda:0'
dtype = torch.float16
causal = True
batch_size = 8
sparsity_sampling_steps = 20
seqlen_vals = [1024,2048,4096,8192,16384,32768,65536]
headdim = 128
dim = 4096
dropout_p = 0.0
method = ("Block_Sparse_Flash2")
time_f = {}
speed_f = {}
excel_label = ["batch_size", "seqlen", "actual_sparsity", "speed", "latency", "speedup", "base_speed", "base_latency"]
excel_data = []
excel_dir_path = "./excel/blocksparse/"
excel_file_name = f"hdim{headdim}_nheads{dim // headdim}_bts{batch_size}_fwd"
if causal:
excel_file_name += "_causal"
all_results = {}
for seqlen in seqlen_vals:
results = {}
nheads = dim // headdim
shape = (batch_size * seqlen, nheads, headdim)
q = torch.randn(shape, device=device, dtype=dtype)
k = torch.randn(shape, device=device, dtype=dtype)
v = torch.randn(shape, device=device, dtype=dtype)
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=device)
base_f = time_fwd(flash_attn_varlen_func, q, k, v, cu_seqlens, cu_seqlens, seqlen, seqlen, dropout_p, None, causal, repeats=repeats, verbose=False)
base_speed = efficiency(flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"), base_f)
results["base"] = [[base_f], [base_speed]]
sparsity_list = get_sparsity_list(sparsity_sampling_steps, seqlen, causal)
print(f"sparsity_list: {sparsity_list}")
for sparsity in sparsity_list:
sum_sparsity, sum_speed, sum_latency = 0, 0, 0
for _ in range(block_sparse_repeats):
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=device)
head_mask_type = torch.tensor([1] * nheads, device=device, dtype=torch.int32)
base_blockmask, real_sparsity = generate_base_sparsity_mask(seqlen, seqlen, block_size, block_size, block_size, sparsity, causal = causal, device=device)
base_blockmask = base_blockmask.unsqueeze(0).repeat(batch_size, nheads, 1, 1)
config = (causal, headdim, nheads, batch_size, seqlen, sparsity, real_sparsity)
f = time_fwd(block_sparse_attn_func, q, k, v, cu_seqlens, cu_seqlens, head_mask_type, None, base_blockmask, seqlen, seqlen, dropout_p, is_causal=causal, exact_streaming=False, repeats=repeats, verbose=False)
time_f[config, method] = f
print(f"### causal={causal}, headdim={headdim}, nheads = {nheads}, batch_size={batch_size}, seqlen={seqlen}, real_sparsity={real_sparsity} ###")
speed_f[config, method] = efficiency(flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"), time_f[config, method])
print(
f"{method}"
f"fwd: {speed_f[config, method]:.2f} TFLOPs/s, {(time_f[config, method]*1000):.2f} ms, "
f"fwd base: {base_speed:.2f} TFLOPs/s, {base_f*1000:.2f} ms"
)
sum_sparsity += real_sparsity
sum_speed += speed_f[config, method]
sum_latency += time_f[config, method]
avg_sparsity = sum_sparsity / block_sparse_repeats
avg_speed = sum_speed / block_sparse_repeats
avg_latency = sum_latency / block_sparse_repeats
if avg_sparsity not in results:
results[avg_sparsity] = [[],[]]
results[avg_sparsity][0].append(avg_latency)
results[avg_sparsity][1].append(avg_speed)
excel_data.append([batch_size, seqlen, avg_sparsity, avg_speed, avg_latency, avg_speed / base_speed, base_speed, base_f])
for key in results.keys():
avg_latency = sum(results[key][0]) / len(results[key][0])
avg_speed = sum(results[key][1]) / len(results[key][1])
results[key] = [avg_latency, avg_speed]
all_results[seqlen] = results
import json
with open(f"all_results_{excel_file_name}.json", "w") as f:
json.dump(all_results, f)
write_to_excel(excel_label, excel_data, excel_dir_path, excel_file_name)
profile_blocksparse_fwd()
# Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/benchmarks/benchmark_flash_attention.py
import openpyxl
from block_sparse_attn.utils.benchmark import benchmark_forward
import math
import torch
from block_sparse_attn import (
token_streaming_attn_func,
flash_attn_varlen_func,
)
from utils import (
time_fwd,
flops,
efficiency,
write_to_excel,
)
def profile_exact_streaming_fwd():
repeats = 20
block_sparse_repeats = 10
device = 'cuda:0'
dtype = torch.float16
causal = True
batch_size = 8
sink_local_num = [64,256]
seqlen_vals = [4096,8192,16384,32768,65536]
headdim_vals = [128]
dim = 4096
dropout_p = 0.0
methods = (["Flash2"])
time_f = {}
speed_f = {}
for headdim in headdim_vals:
excel_label = ["batch_size", "seqlen", "speed", "latency", "speedup", "base_speed", "base_latency"]
excel_data = []
excel_dir_path = "./excel/streaming/"
excel_file_name = f"hdim{headdim}_nheads{dim // headdim}_bts{batch_size}_sink{sink_local_num[0]}_local{sink_local_num[1]}_fwd"
for seqlen in seqlen_vals:
nheads = dim // headdim
shape = (batch_size * seqlen, nheads, headdim)
q = torch.randn(shape, device=device, dtype=dtype)
k = torch.randn(shape, device=device, dtype=dtype)
v = torch.randn(shape, device=device, dtype=dtype)
cu_seqlens = torch.arange(
0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=device)
base_f = time_fwd(flash_attn_varlen_func, q, k, v, cu_seqlens, cu_seqlens, seqlen, seqlen, dropout_p, None, causal, repeats=repeats, verbose=False)
base_speed = efficiency(flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"), base_f)
head_mask_type = torch.tensor([-1] * (nheads//2) + [0] * (nheads - nheads//2), device=device, dtype=torch.int32)
streaming_info = torch.tensor([sink_local_num[0], sink_local_num[1]] * nheads, device=device, dtype=torch.int32)
config = (causal, headdim, nheads, batch_size, seqlen, sink_local_num[0], sink_local_num[1])
sum_speed, sum_latency = 0,0
for _ in range(block_sparse_repeats):
f = time_fwd(
token_streaming_attn_func, q, k, v, cu_seqlens, cu_seqlens, head_mask_type, streaming_info, seqlen, seqlen, repeats=repeats, verbose=False
)
time_f[config, "Flash2"] = f
print(f"### causal={causal}, headdim={headdim}, nheads = {nheads}, batch_size={batch_size}, seqlen={seqlen}, sink={sink_local_num[0]}, local={sink_local_num[1]} ###")
for method in methods:
speed_f[config, method] = efficiency(
flops(batch_size, seqlen, headdim,
nheads, causal, mode="fwd"),
time_f[config, method]
)
print(f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, {(time_f[config, method]*1000):.2f} ms, ")
sum_speed += speed_f[config, "Flash2"]
sum_latency += time_f[config, "Flash2"]
excel_data.append([batch_size, seqlen, sum_speed / block_sparse_repeats, sum_latency / block_sparse_repeats, (sum_speed / block_sparse_repeats) / base_speed, base_speed, base_f])
write_to_excel(excel_label, excel_data, excel_dir_path, excel_file_name)
profile_exact_streaming_fwd()
# Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/benchmarks/benchmark_flash_attention.py
import openpyxl
from block_sparse_attn.utils.benchmark import benchmark_forward
import math
import torch
import os
def benchmark_fwd(
fn,
*inputs,
grad=None,
repeats=10,
desc="",
verbose=True,
amp=False,
amp_dtype=torch.float16,
**kwinputs,
):
"""Use Pytorch Benchmark on the forward pass of an arbitrary function."""
return benchmark_forward(
fn,
*inputs,
repeats=repeats,
desc=desc,
verbose=verbose,
amp=amp,
amp_dtype=amp_dtype,
**kwinputs,
)
def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"):
assert mode in ["fwd"]
f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1)
return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f)
def efficiency(flop, time):
return (flop / time / 10**12) if not math.isnan(time) else 0.0
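# Worked example (hypothetical timing): flops(batch=8, seqlen=4096, headdim=128, nheads=32,
# causal=True) = 4 * 8 * 4096**2 * 32 * 128 // 2 ≈ 1.10e12 FLOPs; if the measured forward time
# were 5 ms, efficiency(1.10e12, 5e-3) ≈ 220 TFLOPs/s.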
def time_fwd(func, *args, **kwargs):
time_f = benchmark_fwd(func, *args, **kwargs)
return time_f[1].mean
def write_to_excel(label, data, dir_path, file_name):
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.append(label)
os.makedirs(dir_path, exist_ok=True)
for row in data:
sheet.append(row)
workbook.save(dir_path + file_name + ".xlsx")
# Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/tests/test_flash_attn.py
import pytest
import torch
from einops import repeat
from block_sparse_attn import (
block_sparse_attn_func,
)
from utils import (
generate_random_padding_mask,
generate_base_sparsity_mask,
generate_qkv,
generate_streaming_mask,
prepare_mixed_exact_mask,
prepare_mixed_mask,
convert_flash_attn_S_to_softmax,
normalize_flash_attn_S,
get_dropout_fraction,
attention_blocksparse_ref
)
MAX_HEADDIM_SM8x = 192
block_size = 128
is_sm75 = torch.cuda.get_device_capability("cuda") == (7, 5)
is_sm8x = torch.cuda.get_device_capability("cuda")[0] == 8
is_sm80 = torch.cuda.get_device_capability("cuda") == (8, 0)
is_sm90 = torch.cuda.get_device_capability("cuda") == (9, 0)
@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16]))
@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
@pytest.mark.parametrize("d", [32, 64, 128])
@pytest.mark.parametrize(
"seqlen_q,seqlen_k",
[
(113, 203),
(128, 217),
(113, 211),
(108, 256),
(256, 512),
(512, 256),
(1024, 1024),
(1023, 1024),
(1024, 1023),
(2048, 2048),
],
)
@pytest.mark.parametrize(
"causal, exact_streaming, sink_num, local_num",
[
# (True, True, 1, 3),
# (True, True, 64, 256),
(True, False, 1, 3),
(False, False, 1, 3),
]
)
@pytest.mark.parametrize("p_dropout", [0.17, 0.0])
@pytest.mark.parametrize("sparsity", [0, 0.1, 0.3, 0.7, 1.0])
@pytest.mark.parametrize("batch_size", [1, 2])
@pytest.mark.parametrize("nheads", [16, 32])
def test_flash_attn_varlen_block_output(
seqlen_q, seqlen_k, d, p_dropout, causal, exact_streaming, sink_num, local_num, mha_type, dtype, sparsity, batch_size, nheads
):
if (
max(seqlen_q, seqlen_k) >= 2048
and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30
):
pytest.skip() # Reference implementation OOM
device = "cuda:0"
# set seed
torch.random.manual_seed(42)
nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 8)
assert nheads % nheads_k == 0
window_size = (-1, -1)
q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True)
k = torch.randn(batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True)
v = torch.randn(batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True)
query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random")
key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random")
alibi_slopes, attn_bias = None, None
(
q_unpad,
k_unpad,
v_unpad,
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q,
max_seqlen_k,
q,
k,
v,
output_pad_fn,
dq_pad_fn,
dk_pad_fn,
) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False)
num_streaming_heads = nheads // 3
num_blocksparse_heads = nheads // 3
num_dense_heads = nheads - num_streaming_heads - num_blocksparse_heads
sparsity_list = [sparsity] * num_blocksparse_heads
head_mask_type = torch.tensor([0] * num_dense_heads + [1] * num_blocksparse_heads + [-1] * num_streaming_heads, device=device, dtype=torch.int32)
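# head_mask_type convention, as consumed by prepare_mixed_mask/prepare_mixed_exact_mask below:
# 0 marks a dense head, a positive value marks a block-sparse head (after replace_ones_with_count
# it indexes into base_blockmask), and -1 marks a streaming head.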
base_blockmask = generate_base_sparsity_mask(max_seqlen_q, max_seqlen_k, block_size, block_size, block_size, batch_size, num_blocksparse_heads, sparsity_list, causal = causal, device=device)
streaming_info = torch.tensor([sink_num, local_num] * nheads, device=device, dtype=torch.int32)
streaming_mask = generate_streaming_mask(max_seqlen_q, max_seqlen_k, batch_size, nheads, cu_seqlens_q, cu_seqlens_k, block_size, block_size, block_size, streaming_info, causal=causal, device=device)
if exact_streaming:
assert causal
print(f"exact_streaming: {exact_streaming}")
if exact_streaming:
mixed_mask = prepare_mixed_exact_mask(base_blockmask, streaming_info, head_mask_type, batch_size, nheads, block_size, block_size, block_size, max_seqlen_q, max_seqlen_k, q.shape[1], k.shape[1], query_padding_mask, key_padding_mask, device=device)
else:
mixed_mask = prepare_mixed_mask(base_blockmask, streaming_mask, head_mask_type, batch_size, nheads, block_size, block_size, block_size, max_seqlen_q, max_seqlen_k, q.shape[1], k.shape[1], device=device)
out_unpad, sm_lse, S_dmask = block_sparse_attn_func(
q_unpad, k_unpad, v_unpad,
cu_seqlens_q, cu_seqlens_k,
head_mask_type,
streaming_info,
base_blockmask,
max_seqlen_q, max_seqlen_k,
p_dropout,
deterministic=True,
softmax_scale=None,
is_causal=causal,
exact_streaming=exact_streaming,
return_attn_probs=True,
)
out = output_pad_fn(out_unpad)
if p_dropout > 0.0:
assert S_dmask is not None
S_dmask_converted = convert_flash_attn_S_to_softmax(
S_dmask,
seqlen_q,
seqlen_k,
query_padding_mask,
key_padding_mask,
d,
p_dropout > 0.0,
causal=causal,
window_size=window_size,
)
dropout_mask = S_dmask_converted >= 0
attn_unnorm = S_dmask_converted.abs()
k_rep = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k)
v_rep = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_k)
attn = normalize_flash_attn_S(
attn_unnorm,
q,
k_rep,
v_rep,
query_padding_mask,
key_padding_mask,
attn_bias,
p_dropout > 0.0,
causal=causal,
window_size=window_size,
)
dropout_fraction = get_dropout_fraction(
dropout_mask,
mixed_mask,
block_size, block_size,
query_padding_mask,
key_padding_mask,
causal=causal,
window_size=window_size,
).item()
print(f"Actual dropout fraction: {dropout_fraction}")
else:
dropout_mask = None
out_ref, attn_ref = attention_blocksparse_ref(
q,
k,
v,
mixed_mask,
block_size, block_size,
query_padding_mask,
key_padding_mask,
p_dropout,
dropout_mask,
causal=causal,
window_size=window_size,
)
out_pt, attn_pt = attention_blocksparse_ref(
q,
k,
v,
mixed_mask,
block_size, block_size,
query_padding_mask,
key_padding_mask,
p_dropout,
dropout_mask,
causal=causal,
window_size=window_size,
upcast=False,
reorder_ops=True,
)
print(f"Output max diff: {(out - out_ref).abs().max().item()}")
print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")
g = torch.randn_like(out)
# g = torch.zeros_like(out)
if d <= MAX_HEADDIM_SM8x or (is_sm80 or is_sm90):
(
dq_unpad,
dk_unpad,
dv_unpad,
) = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g)
dk = dk_pad_fn(dk_unpad)
dv = dk_pad_fn(dv_unpad)
(
dq_ref,
dk_ref,
dv_ref,
) = torch.autograd.grad(out_ref, (q, k, v), g)
(
dq_pt,
dk_pt,
dv_pt,
) = torch.autograd.grad(out_pt, (q, k, v), g)
dq = dq_pad_fn(dq_unpad)
print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}")
print(f"dK max diff: {(dk - dk_ref).abs().max().item()}")
print(f"dV max diff: {(dv - dv_ref).abs().max().item()}")
print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}")
print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}")
print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}")
print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}")
print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}")
print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}")
print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}")
print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}")
print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}")
assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item()
if d <= MAX_HEADDIM_SM8x or (is_sm80 or is_sm90):
assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item()
assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item()
assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item()
# Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/tests/test_flash_attn.py
import math
import torch
import torch.nn.functional as F
from einops import rearrange, repeat
from block_sparse_attn.bert_padding import pad_input, unpad_input
from block_sparse_attn.flash_attn_interface import _get_block_size
torch.set_printoptions(profile="full")
def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"):
assert mode in ["full", "random", "third"]
if mode == "full":
lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32)
elif mode == "random":
lengths = torch.randint(
max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device
)
elif mode == "third":
lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device)
padding_mask = (
repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths
)
return padding_mask
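# Returns a boolean (batch_size, max_seqlen) mask where True marks a valid (non-padded) position;
# in "random" mode each sequence length is drawn from [max(1, max_seqlen - 20), max_seqlen].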
def construct_local_mask(
seqlen_q,
seqlen_k,
window_size=(-1, -1), # -1 means infinite window size
query_padding_mask=None,
key_padding_mask=None,
device=None,
):
row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1")
col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
sk = (
seqlen_k
if key_padding_mask is None
else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
)
sq = (
seqlen_q
if query_padding_mask is None
else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
)
if window_size[0] < 0:
return col_idx > row_idx + sk - sq + window_size[1]
else:
sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
return torch.logical_or(
col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk),
col_idx < row_idx + sk - sq - window_size[0],
)
def construct_exact_streaming_mask(
seqlen_q,
seqlen_k,
sink_size, # -1 means infinite window size
local_size,
query_padding_mask=None,
key_padding_mask=None,
device=None,
):
assert sink_size >= 0
assert local_size >= 1
row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1")
col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
sk = (
seqlen_k
if key_padding_mask is None
else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
)
sq = (
seqlen_q
if query_padding_mask is None
else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
)
sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
mask = torch.logical_or(
col_idx > torch.minimum(row_idx + sk - sq, sk),
torch.logical_and(
col_idx < row_idx + sk - sq - (local_size-1), col_idx >= sink_size,
)
)
return mask
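# The returned mask is True at positions that must be masked out: each query attends only to the
# first sink_size keys plus the most recent local_size keys within the causal bound. Small sketch
# (no padding, seqlen_q = seqlen_k = 5, sink_size = 1, local_size = 2): row 4 attends to columns
# {0, 3, 4}.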
def generate_qkv(
q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False
):
"""
Arguments:
q: (batch_size, seqlen_q, nheads, d)
k: (batch_size, seqlen_k, nheads_k, d)
v: (batch_size, seqlen_k, nheads_k, d)
query_padding_mask: (batch_size, seqlen), bool
key_padding_mask: (batch_size, seqlen), bool
"""
assert not (kvpacked and qkvpacked)
batch_size, seqlen_q, nheads, d = q.shape
_, seqlen_k, nheads_k, _ = k.shape
assert k.shape == (batch_size, seqlen_k, nheads_k, d)
assert v.shape == (batch_size, seqlen_k, nheads_k, d)
if query_padding_mask is not None:
q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask)
output_pad_fn = lambda output_unpad: pad_input(
output_unpad, indices_q, batch_size, seqlen_q
)
else:
q_unpad = rearrange(q, "b s h d -> (b s) h d")
cu_seqlens_q = torch.arange(
0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device
)
max_seqlen_q = seqlen_q
output_pad_fn = lambda output_unpad: rearrange(
output_unpad, "(b s) h d -> b s h d", b=batch_size
)
if key_padding_mask is not None:
k_unpad, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask)
v_unpad, _, _, _ = unpad_input(v, key_padding_mask)
else:
k_unpad = rearrange(k, "b s h d -> (b s) h d")
v_unpad = rearrange(v, "b s h d -> (b s) h d")
cu_seqlens_k = torch.arange(
0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device
)
max_seqlen_k = seqlen_k
if qkvpacked:
assert (query_padding_mask == key_padding_mask).all()
assert nheads == nheads_k
qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1)
qkv = torch.stack([q, k, v], dim=2)
if query_padding_mask is not None:
dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q)
else:
dqkv_pad_fn = lambda dqkv_unpad: rearrange(
dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
)
return (
qkv_unpad.detach().requires_grad_(),
cu_seqlens_q,
max_seqlen_q,
qkv.detach().requires_grad_(),
output_pad_fn,
dqkv_pad_fn,
)
elif kvpacked:
kv_unpad = torch.stack([k_unpad, v_unpad], dim=1)
kv = torch.stack([k, v], dim=2)
dq_pad_fn = output_pad_fn
if key_padding_mask is not None:
dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k)
else:
dkv_pad_fn = lambda dkv_unpad: rearrange(
dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
)
return (
q_unpad.detach().requires_grad_(),
kv_unpad.detach().requires_grad_(),
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q,
max_seqlen_k,
q.detach().requires_grad_(),
kv.detach().requires_grad_(),
output_pad_fn,
dq_pad_fn,
dkv_pad_fn,
)
else:
dq_pad_fn = output_pad_fn
if key_padding_mask is not None:
dk_pad_fn = lambda dk_unpad: pad_input(dk_unpad, indices_k, batch_size, seqlen_k)
else:
dk_pad_fn = lambda dk_unpad: rearrange(dk_unpad, "(b s) h d -> b s h d", b=batch_size)
return (
q_unpad.detach().requires_grad_(),
k_unpad.detach().requires_grad_(),
v_unpad.detach().requires_grad_(),
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q,
max_seqlen_k,
q.detach().requires_grad_(),
k.detach().requires_grad_(),
v.detach().requires_grad_(),
output_pad_fn,
dq_pad_fn,
dk_pad_fn,
)
def generate_base_sparsity_mask(max_seqlen_q, max_seqlen_k, round_base, m_block_dim, n_block_dim, batch_size, num_blocksparse_heads, sparsity_list, causal=False, device="cuda"):
assert len(sparsity_list) == num_blocksparse_heads
def round_to_multiple(x, base):
return ((x + base - 1) // base) * base
nrow, ncol = round_to_multiple(max_seqlen_q, round_base) // m_block_dim, round_to_multiple(max_seqlen_k, round_base) // n_block_dim
base_mask = torch.zeros(batch_size, num_blocksparse_heads, nrow, ncol, device=device, dtype=torch.bool)
for batch in range(batch_size):
for head_rank in range(num_blocksparse_heads):
sparsity = sparsity_list[head_rank]
if not sparsity == 0.0 and not sparsity == 1.0:
for i in range(nrow):
idx = nrow - i - 1
if causal:
available_col_num = max(0, ncol - i)
num_one = max(1, int(sparsity * available_col_num))
base_mask[batch][head_rank][idx, torch.randperm(available_col_num)[:num_one]] = True
else:
available_col_num = ncol
num_one = max(1, int(sparsity * available_col_num))
base_mask[batch][head_rank][idx, torch.randperm(available_col_num)[:num_one]] = True
elif sparsity == 1.0:
base_mask[batch][head_rank] = torch.ones_like(base_mask[batch][head_rank])
return base_mask
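# Note: unlike the benchmark helper of the same name, `sparsity` here acts as the per-row keep
# fraction (1.0 keeps every block), and a separate random pattern is drawn for each batch entry
# and block-sparse head, giving a (batch_size, num_blocksparse_heads, nrow, ncol) boolean mask.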
def generate_streaming_mask(max_seqlen_q, max_seqlen_k, batch_size, num_heads,cu_q_len_list, cu_k_len_list, round_base, m_block_dim, n_block_dim, streaming_info, causal=False, device="cuda"):
assert len(streaming_info) == 2 * num_heads
assert len(cu_q_len_list) == batch_size + 1
assert len(cu_k_len_list) == batch_size + 1
assert round_base == m_block_dim == n_block_dim == 128
def round_to_multiple(x, base):
return ((x + base - 1) // base) * base
def ceil_div(x, y):
return (x + y - 1) // y
nrow, ncol = round_to_multiple(max_seqlen_q, round_base) // m_block_dim, round_to_multiple(max_seqlen_k, round_base) // n_block_dim
base_mask = torch.zeros(batch_size, num_heads, nrow, ncol, device=device, dtype=torch.bool)
for batch in range(batch_size):
actual_q = cu_q_len_list[batch + 1] - cu_q_len_list[batch]
actual_k = cu_k_len_list[batch + 1] - cu_k_len_list[batch]
start_row_idx = max((actual_q - actual_k) // m_block_dim, 0) if causal else 0
for head_rank in range(num_heads):
sink_block_num, local_block_num = streaming_info[head_rank * 2], streaming_info[head_rank * 2 + 1]
for i in range(start_row_idx, nrow):
if causal:
max_row_block_num = ceil_div(max(actual_k - actual_q, 0), n_block_dim) + 1 + i - start_row_idx
else:
max_row_block_num = ncol
base_mask[batch, head_rank, i, min(max(max_row_block_num - local_block_num, 0), ncol):min(max_row_block_num, ncol)] = True
base_mask[batch, head_rank, i, :sink_block_num] = True
return base_mask
def replace_ones_with_count(tensor):
ones_mask = tensor == 1
count = torch.cumsum(ones_mask, dim=-1).to(tensor.dtype)
count = count * ones_mask
tensor = tensor.masked_scatter(ones_mask, count[ones_mask])
return tensor
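# Example: replace_ones_with_count(torch.tensor([0, 1, 1, -1, 1])) -> tensor([0, 1, 2, -1, 3]);
# each 1 in head_mask_type becomes its 1-based rank among the block-sparse heads, which is what
# prepare_mixed_mask uses to index base_blocksparse_mask.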
def prepare_mixed_mask(base_blocksparse_mask, base_streaming_mask, head_mask_type, batch_size, num_heads, round_base, m_block_dim, n_block_dim, max_seqlen_q, max_seqlen_k, actual_seqlen_q, actual_seqlen_k, device="cuda"):
def round_to_multiple(x, base):
return ((x + base - 1) // base) * base
nrow, ncol = round_to_multiple(max_seqlen_q, round_base) // m_block_dim, round_to_multiple(max_seqlen_k, round_base) // n_block_dim
mixed_mask = torch.zeros(batch_size, num_heads, nrow, ncol, device=device, dtype=torch.bool)
head_mask_type = replace_ones_with_count(head_mask_type)
for head_rank in range(num_heads):
mask_type = head_mask_type[head_rank]
if mask_type == 0:
mixed_mask[:, head_rank, :, :] = torch.ones_like(mixed_mask[:, head_rank, :, :])
elif mask_type > 0:
for i in range(batch_size):
mixed_mask[i, head_rank, :, :] = base_blocksparse_mask[i][mask_type - 1]
else:
for i in range(batch_size):
mixed_mask[i, head_rank, :, :] = base_streaming_mask[i, head_rank, :, :]
mixed_mask = repeat(mixed_mask, "b h s_m s_n -> b h (s_m d_m) (s_n d_n)", d_m=m_block_dim, d_n=n_block_dim)
mixed_mask = tailor_mixedmask_for_test(mixed_mask, actual_seqlen_q, actual_seqlen_k)
mixed_mask = ~mixed_mask
return mixed_mask
def prepare_mixed_exact_mask(base_blocksparse_mask, streaming_info, head_mask_type, batch_size, num_heads, round_base, m_block_dim, n_block_dim, max_seqlen_q, max_seqlen_k, actual_seqlen_q, actual_seqlen_k, query_padding_mask,
key_padding_mask, device="cuda"):
def round_to_multiple(x, base):
return ((x + base - 1) // base) * base
nrow, ncol = round_to_multiple(max_seqlen_q, round_base) // m_block_dim, round_to_multiple(max_seqlen_k, round_base) // n_block_dim
mixed_mask = torch.zeros(batch_size, num_heads, nrow, ncol, device=device, dtype=torch.bool)
head_mask_type = replace_ones_with_count(head_mask_type)
for head_rank in range(num_heads):
mask_type = head_mask_type[head_rank]
if mask_type == 0:
mixed_mask[:, head_rank, :, :] = torch.ones_like(mixed_mask[:, head_rank, :, :])
elif mask_type > 0:
for i in range(batch_size):
mixed_mask[i, head_rank, :, :] = base_blocksparse_mask[i][mask_type - 1]
mixed_mask = repeat(mixed_mask, "b h s_m s_n -> b h (s_m d_m) (s_n d_n)", d_m=m_block_dim, d_n=n_block_dim)
mixed_mask = tailor_mixedmask_for_test(mixed_mask, actual_seqlen_q, actual_seqlen_k)
mixed_mask = ~mixed_mask
for head_rank in range(num_heads):
mask_type = head_mask_type[head_rank]
if mask_type < 0:
exact_streaming_mask = construct_exact_streaming_mask(
actual_seqlen_q,
actual_seqlen_k,
streaming_info[head_rank * 2],
streaming_info[head_rank * 2 + 1],
query_padding_mask,
key_padding_mask,
device=device,
)
if exact_streaming_mask.dim() == 4:
for i in range(batch_size):
mixed_mask[i, head_rank, :, :] = exact_streaming_mask[i,0,:,:]
else:
for i in range(batch_size):
mixed_mask[i, head_rank, :, :] = exact_streaming_mask
return mixed_mask
def attention_blocksparse_ref(
q, k, v,
mixed_mask,
m_block_dim, n_block_dim,
query_padding_mask=None,
key_padding_mask=None,
p_dropout=0.0,
dropout_mask=None,
causal=False,
window_size=(-1, -1),
upcast=True,
reorder_ops=False,
):
# q, k, v = qkv.float().unbind(dim=2)
if causal:
window_size = (window_size[0], 0)
dtype_og = q.dtype
if upcast:
q, k, v = q.float(), k.float(), v.float()
seqlen_q, seqlen_k = q.shape[1], k.shape[1]
k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2])
v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2])
d = q.shape[-1]
if not reorder_ops:
scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k)
else:
scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d))
if key_padding_mask is not None:
scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf"))
# local mask
if window_size[0] >= 0 or window_size[1] >= 0:
local_mask = construct_local_mask(
seqlen_q,
seqlen_k,
window_size,
query_padding_mask,
key_padding_mask,
q.device,
)
scores.masked_fill_(local_mask, float("-inf"))
scores.masked_fill_(rearrange(mixed_mask, "b h t s -> b h t s"), float("-inf"))
# print("processed blockmask: ", rearrange(~base_blockmask, "h t s -> 1 h t s"))
attention = torch.softmax(scores, dim=-1).to(v.dtype)
if window_size[0] >= 0 or window_size[1] >= 0:
attention = attention.masked_fill(torch.all(torch.bitwise_or(local_mask, rearrange(mixed_mask, "b h t s -> b h t s")), dim=-1, keepdim=True), 0.0)
attention = attention.masked_fill(rearrange(mixed_mask, "b h t s -> b h t s"), 0.0)
if query_padding_mask is not None:
attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0)
dropout_scaling = 1.0 / (1 - p_dropout)
if dropout_mask is not None:
attention_drop = attention.masked_fill(~dropout_mask, 0.0)
else:
attention_drop = attention
output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling)
if query_padding_mask is not None:
output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0)
return output.to(dtype=dtype_og), attention.to(dtype=dtype_og)
def convert_flash_attn_S_to_softmax(
S,
seqlen_q,
seqlen_k,
query_padding_mask,
key_padding_mask,
head_dim,
is_dropout,
causal=False,
window_size=(-1, -1), # -1 means infinite window size
):
"""FlashAttention stores the S matrix in a different way.
Arguments:
S: (batch_size, nheads, seqlen_q_rounded, seqlen_k_rounded)
query_padding_mask: (batch_size, seqlen_q_rounded)
key_padding_mask: (batch_size, seqlen_k_rounded)
"""
if causal:
window_size = (window_size[0], 0)
seqlen_q_rounded, seqlen_k_rounded = S.shape[-2:]
warps_n = 4
blocksize_m, blocksize_n = _get_block_size(S.device, head_dim, is_dropout, causal)
nblocks_n = (seqlen_k_rounded + blocksize_n - 1) // blocksize_n
nblocks_m = (seqlen_q_rounded + blocksize_m - 1) // blocksize_m
mmas_n = (blocksize_n + 16 - 1) // 16
S_flat = rearrange(
S,
"b h (nblocks_m blocksize_m) (nblocks_n blocksize_n) -> b h nblocks_m nblocks_n (blocksize_m blocksize_n)",
blocksize_m=blocksize_m,
blocksize_n=blocksize_n,
)
S_converted = rearrange(
S_flat,
"b h nblocks_m nblocks_n (mmas_n mmas_m warps_n eight four c2 c1 c0) -> b h (nblocks_m mmas_m warps_n c1 eight) (nblocks_n mmas_n c2 four c0)",
mmas_n=mmas_n,
warps_n=warps_n,
eight=8,
c0=2,
c1=2,
c2=2,
four=4,
)
if window_size[0] >= 0 or window_size[1] >= 0:
local_mask = construct_local_mask(
seqlen_q,
seqlen_k,
window_size,
query_padding_mask,
key_padding_mask,
S.device,
)
local_mask = F.pad(
local_mask,
(0, seqlen_k_rounded - seqlen_k, 0, seqlen_q_rounded - seqlen_q),
value=True,
)
S_converted.masked_fill_(local_mask, 0.0)
# Need to zero out things not in attention_mask in case S was initialized with random values
# and some of those values aren't overwritten.
seqlen_q_og = (
query_padding_mask.shape[-1] if query_padding_mask is not None else seqlen_q_rounded
)
if query_padding_mask is not None:
query_padding_mask = F.pad(query_padding_mask, (0, seqlen_q_rounded - seqlen_q_og))
S_converted = S_converted.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0)
seqlen_k_og = key_padding_mask.shape[-1] if key_padding_mask is not None else seqlen_k
if key_padding_mask is not None:
key_padding_mask = F.pad(key_padding_mask, (0, seqlen_k_rounded - seqlen_k_og))
S_converted = S_converted.masked_fill(rearrange(~key_padding_mask, "b s -> b 1 1 s"), 0.0)
S_converted = F.pad(S_converted, (0, 0, 0, seqlen_q_og - seqlen_q_rounded))
S_converted = F.pad(S_converted, (0, seqlen_k_og - seqlen_k_rounded))
return S_converted[:, :, :seqlen_q, :seqlen_k]
def normalize_flash_attn_S(
attn_unnorm,
q,
k,
v,
query_padding_mask=None,
key_padding_mask=None,
attn_bias=None,
is_dropout=False,
causal=False,
window_size=(-1, -1), # -1 means infinite window size
):
"""
Arguments:
q: (batch_size, seqlen_q, nheads, head_dim)
k, v: (batch_size, seqlen_k, nheads, head_dim)
key_padding_mask: (batch_size, seqlen_k)
attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k)
Output:
softmax_lse: (batch_size, nheads, seqlen_q)
softmax_max: (batch_size, nheads, seqlen_q)
"""
if causal:
window_size = (window_size[0], 0)
q, k, v = q.float(), k.float(), v.float()
_, seqlen_q, _, head_dim = q.shape
seqlen_k = k.shape[1]
scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(head_dim), k)
if key_padding_mask is not None:
scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf"))
if window_size[0] >= 0 or window_size[1] >= 0:
local_mask = construct_local_mask(
seqlen_q,
seqlen_k,
window_size,
query_padding_mask,
key_padding_mask,
q.device,
)
scores.masked_fill_(local_mask, float("-inf"))
if attn_bias is not None:
scores = scores + attn_bias.to(dtype=scores.dtype)
_, block_size_n = _get_block_size(scores.device, head_dim, is_dropout, causal)
scores_block = scores.split(block_size_n, dim=-1)
lse_block = torch.stack([torch.logsumexp(s, dim=-1) for s in scores_block], dim=-1)
lse = torch.logsumexp(lse_block, dim=-1)
# lse could be -inf (i.e. all values in scores are -inf), and we want to set those to inf
# so that when we do torch.exp(m - lse), we get 0.0 instead of NaN.
lse[lse == float("-inf")] = float("inf")
scores_max_block = torch.stack([torch.amax(s, dim=-1) for s in scores_block], dim=-1)
cummax_block = torch.cummax(scores_max_block.flip(-1), dim=-1).values.flip(-1).unbind(dim=-1)
attn_unnorm_block = attn_unnorm.split(block_size_n, dim=-1)
attn_norm = torch.cat(
[
a * rearrange(torch.exp(m - lse), "b h s -> b h s 1")
for a, m in zip(attn_unnorm_block, cummax_block)
],
dim=-1,
)
if query_padding_mask is not None:
attn_norm.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0)
return attn_norm.to(dtype=attn_unnorm.dtype)
def get_dropout_fraction(
dropout_mask,
mixed_mask,
m_block_dim, n_block_dim,
query_padding_mask=None,
key_padding_mask=None,
causal=False,
window_size=(-1, -1), # -1 means infinite window size
):
"""
dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k), bool. True means keep, False means drop.
query_padding_mask: (batch_size, seqlen_q)
key_padding_mask: (batch_size, seqlen_k)
"""
if causal:
window_size = (window_size[0], 0)
batch_size, nheads, seqlen_q, seqlen_k = dropout_mask.shape
dropped = ~dropout_mask
valid = torch.ones_like(dropout_mask)
if mixed_mask is not None:
# mixed_mask = repeat(mixed_mask, "b h s_m s_n -> b h (s_m d_m) (s_n d_n)", d_m=m_block_dim, d_n=n_block_dim)
# mixed_mask = tailor_mixedmask_for_test(mixed_mask, seqlen_q, seqlen_k)
dropped.masked_fill_(rearrange(mixed_mask, "b h t s -> b h t s"), False)
valid.masked_fill_(rearrange(mixed_mask, "b h t s -> b h t s"), False)
if query_padding_mask is not None:
dropped.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), False)
valid.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), False)
if key_padding_mask is not None:
dropped.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), False)
valid.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), False)
if window_size[0] >= 0 or window_size[1] >= 0:
local_mask = construct_local_mask(
seqlen_q,
seqlen_k,
window_size,
query_padding_mask,
key_padding_mask,
dropout_mask.device,
)
dropped.masked_fill_(local_mask, False)
valid.masked_fill_(local_mask, False)
dropped_total = dropped.sum()
return dropped_total / valid.sum()
def modified_check(a, a_ref, a_pt, rel_parameter):
assert a.shape == a_ref.shape
assert a.shape == a_pt.shape
left = (a - a_ref).abs().max().item()
right = (a_pt - a_ref).abs().max().item()
rtol = 1e-3
if not right == 0:
assert left < rel_parameter * right or left < rtol * a_ref.abs().max().item()
else:
assert round(left, 4) == 0 or left < rtol * a_ref.abs().max().item()
def tailor_mixedmask_for_test(spanded_base_mixedmask, seqlen_q, seqlen_k):
batch_size = spanded_base_mixedmask.shape[0]
nheads = spanded_base_mixedmask.shape[1]
spanded_base_mixedmask = spanded_base_mixedmask[:, :, :seqlen_q, :seqlen_k]
pad_blockmask = torch.zeros(batch_size, nheads, seqlen_q, seqlen_k, dtype=torch.bool, device = spanded_base_mixedmask.device)
pad_blockmask[:, :, :spanded_base_mixedmask.shape[2], :spanded_base_mixedmask.shape[3]] = spanded_base_mixedmask
spanded_base_mixedmask = pad_blockmask
spanded_base_mixedmask = spanded_base_mixedmask.contiguous()
return spanded_base_mixedmask
# Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/benchmarks/benchmark_flash_attention.py
import openpyxl
from block_sparse_attn.utils.benchmark import benchmark_forward
import math
import torch
from block_sparse_attn import (
block_streaming_attn_func,
flash_attn_varlen_func,
)
from utils import (
time_fwd_bwd,
flops,
efficiency,
write_to_excel,
)
def profile_block_streaming_fwd_bwd():
repeats = 10
block_sparse_repeats = 5
device = 'cuda:0'
dtype = torch.float16
causal = True
batch_size = 1
sink_local_block_num = [1,3]
seqlen_vals = [1024, 2048, 4096, 8192, 16384, 20480, 24576, 28672, 32768, 65536, 131072]
headdim_vals = [128]
dim = 4096
p_dropout = 0.0
methods = (["Flash2"])
time_f = {}
time_b = {}
time_f_b = {}
speed_f = {}
speed_b = {}
speed_f_b = {}
for headdim in headdim_vals:
excel_label = ["batch_size", "seqlen", "speed", "latency", "speedup", "base_speed", "base_latency"]
excel_data = []
excel_dir_path = "./excel/block_streaming/"
excel_file_name = f"hdim{headdim}_nheads{dim // headdim}_bts{batch_size}_sink_block{sink_local_block_num[0]}_local_block{sink_local_block_num[1]}_fwd_bwd"
for seqlen in seqlen_vals:
nheads = dim // headdim
shape = (batch_size * seqlen, nheads, headdim)
q = torch.randn(shape, device=device, dtype=dtype, requires_grad=True)
k = torch.randn(shape, device=device, dtype=dtype, requires_grad=True)
v = torch.randn(shape, device=device, dtype=dtype, requires_grad=True)
cu_seqlens = torch.arange(
0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=device)
base_f, base_b = time_fwd_bwd(flash_attn_varlen_func, q, k, v, cu_seqlens, cu_seqlens, seqlen, seqlen, p_dropout, None, causal, repeats=repeats, verbose=False)
base_speed = efficiency(flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"), base_f + base_b)
head_mask_type = torch.tensor([-1] * (nheads//2) + [0] * (nheads - nheads//2), device=device, dtype=torch.int32)
streaming_info = torch.tensor([sink_local_block_num[0], sink_local_block_num[1]] * nheads, device=device, dtype=torch.int32)
config = (causal, headdim, nheads, batch_size, seqlen, sink_local_block_num[0], sink_local_block_num[1])
sum_speed, sum_latency = 0,0
for _ in range(block_sparse_repeats):
f, b = time_fwd_bwd(
block_streaming_attn_func, q, k, v, cu_seqlens, cu_seqlens, head_mask_type, streaming_info, seqlen, seqlen, p_dropout, False, None, causal, repeats=repeats, verbose=False
)
time_f[config, "Flash2"] = f
time_b[config, "Flash2"] = b
print(f"### causal={causal}, headdim={headdim}, nheads = {nheads}, batch_size={batch_size}, seqlen={seqlen}, sink={sink_local_block_num[0]}, local={sink_local_block_num[1]} ###")
for method in methods:
time_f_b[config, method] = time_f[config, method] + time_b[config, method]
speed_f[config, method] = efficiency(
flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"),
time_f[config, method]
)
speed_b[config, method] = efficiency(
flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"),
time_b[config, method]
)
speed_f_b[config, method] = efficiency(
flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"),
time_f_b[config, method]
)
print(
f"{method}"
f"fwd: {speed_f[config, method]:.2f} TFLOPs/s, {(time_f[config, method]*1000):.2f} ms, "
f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, {(time_b[config, method]*1000):.2f} ms, "
f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s, {(time_f_b[config, method]*1000):.2f} ms, "
f"fwd + bwd base: {base_speed:.2f} TFLOPs/s, {(base_f + base_b)*1000:.2f} ms"
)
sum_speed += speed_f_b[config, "Flash2"]
sum_latency += time_f_b[config, "Flash2"]
excel_data.append([batch_size, seqlen, sum_speed / block_sparse_repeats, sum_latency / block_sparse_repeats, (sum_speed / block_sparse_repeats) / base_speed, base_speed, base_f])
write_to_excel(excel_label, excel_data, excel_dir_path, excel_file_name)
profile_block_streaming_fwd_bwd()
# Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/benchmarks/benchmark_flash_attention.py
import torch
from block_sparse_attn import (
block_sparse_attn_func,
flash_attn_varlen_func,
)
from utils import (
time_fwd_bwd,
flops,
efficiency,
write_to_excel,
)
def generate_base_sparsity_mask(max_seqlen_q, max_seqlen_k, round_base, m_block_dim, n_block_dim, sparsity, causal=False, device="cuda"):
def round_to_multiple(x, base):
return ((x + base - 1) // base) * base
nrow, ncol = round_to_multiple(max_seqlen_q, round_base) // m_block_dim, round_to_multiple(max_seqlen_k, round_base) // n_block_dim
base_mask = torch.zeros(1, nrow, ncol, device=device, dtype=torch.bool)
total_block_num = 0
density = 1.0 - sparsity
if not density == 0.0 and not density == 1.0:
for i in range(nrow): # do in reverse order
idx = nrow - i - 1
if causal:
available_col_num = max(0, ncol - i)
total_block_num += available_col_num
num_one = max(1, int(density * available_col_num))
base_mask[0][idx, torch.randperm(available_col_num)[:num_one]] = True
else:
available_col_num = ncol
total_block_num += available_col_num
num_one = max(1, int(density * available_col_num))
base_mask[0][idx, torch.randperm(available_col_num)[:num_one]] = True
elif density == 1.0:
base_mask[0] = torch.ones_like(base_mask[0])
total_block_num = nrow * ncol
else:
total_block_num = nrow * ncol
calculated_block_num = base_mask.sum().item()
real_sparsity = 1.0 - calculated_block_num / total_block_num
return base_mask, real_sparsity
block_size = 128
def get_sparsity_list(sampling_steps, seqlen, causal):
blockmask_element_num = (seqlen // block_size) ** 2 // (2 if causal else 1)
stride = max(blockmask_element_num // sampling_steps, 1)
actual_steps = (blockmask_element_num + stride - 1) // stride
sparsity_list = []
for i in range(actual_steps):
sparse_rate = (1 + i * stride) / blockmask_element_num
if sparse_rate > 0.95 or sparse_rate < 0.0:
continue
sparsity_list.append(sparse_rate)
return sparsity_list
def profile_blocksparse_fwd_bwd():
repeats = 10
block_sparse_repeats = 5
device = 'cuda:0'
dtype = torch.float16
causal = True
batch_size = 1
sparsity_sampling_steps = 20
seqlen_vals = [8192,16384,32768]
headdim = 128
dim = 4096
dropout_p = 0.0
method = ("Block_Sparse_Attn")
time_f = {}
time_b = {}
time_f_b = {}
speed_f = {}
speed_b = {}
speed_f_b = {}
excel_label = ["batch_size", "seqlen", "actual_sparsity", "speed", "latency", "speedup", "base_speed", "base_latency"]
excel_data = []
excel_dir_path = "./excel/blocksparse/"
excel_file_name = f"hdim{headdim}_nheads{dim // headdim}_bts{batch_size}_fwd_bwd"
if causal:
excel_file_name += "_causal"
all_results = {}
for seqlen in seqlen_vals:
results = {}
nheads = dim // headdim
shape = (batch_size * seqlen, nheads, headdim)
q = torch.randn(shape, device=device, dtype=dtype, requires_grad=True)
k = torch.randn(shape, device=device, dtype=dtype, requires_grad=True)
v = torch.randn(shape, device=device, dtype=dtype, requires_grad=True)
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=device)
base_f, base_b = time_fwd_bwd(flash_attn_varlen_func, q, k, v, cu_seqlens, cu_seqlens, seqlen, seqlen, dropout_p, None, causal, repeats=repeats, verbose=False)
base_speed = efficiency(flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"), base_f + base_b)
results["base"] = [[base_f + base_b], [base_speed]]
sparsity_list = get_sparsity_list(sparsity_sampling_steps, seqlen, causal)
print(f"sparsity_list: {sparsity_list}")
for sparsity in sparsity_list:
sum_sparsity, sum_speed, sum_latency = 0, 0, 0
for _ in range(block_sparse_repeats):
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=device)
head_mask_type = torch.tensor([1] * nheads, device=device, dtype=torch.int32)
base_blockmask, real_sparsity = generate_base_sparsity_mask(seqlen, seqlen, block_size, block_size, block_size, sparsity, causal = causal, device=device)
base_blockmask = base_blockmask.unsqueeze(0).repeat(batch_size, nheads, 1, 1)
config = (causal, headdim, nheads, batch_size, seqlen, sparsity, real_sparsity)
f, b = time_fwd_bwd(block_sparse_attn_func, q, k, v, cu_seqlens, cu_seqlens, head_mask_type, None, base_blockmask, seqlen, seqlen, dropout_p, is_causal=causal, exact_streaming=False, repeats=repeats, verbose=False)
time_f[config, method] = f
time_b[config, method] = b
print(f"### causal={causal}, headdim={headdim}, nheads = {nheads}, batch_size={batch_size}, seqlen={seqlen}, real_sparsity={real_sparsity} ###")
time_f_b[config, method] = time_f[config, method] + time_b[config, method]
speed_f[config, method] = efficiency(
flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"),
time_f[config, method]
)
speed_b[config, method] = efficiency(
flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"),
time_b[config, method]
)
speed_f_b[config, method] = efficiency(
flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"),
time_f_b[config, method]
)
print(
f"{method}"
f"fwd: {speed_f[config, method]:.2f} TFLOPs/s, {(time_f[config, method]*1000):.2f} ms, "
f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, {(time_b[config, method]*1000):.2f} ms, "
f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s, {(time_f_b[config, method]*1000):.2f} ms, "
f"fwd + bwd base: {base_speed:.2f} TFLOPs/s, {(base_f + base_b)*1000:.2f} ms"
)
sum_sparsity += real_sparsity
sum_speed += speed_f_b[config, method]
sum_latency += time_f_b[config, method]
avg_sparsity = sum_sparsity / block_sparse_repeats
avg_speed = sum_speed / block_sparse_repeats
avg_latency = sum_latency / block_sparse_repeats
if avg_sparsity not in results:
results[avg_sparsity] = [[],[]]
results[avg_sparsity][0].append(avg_latency)
results[avg_sparsity][1].append(avg_speed)
excel_data.append([batch_size, seqlen, avg_sparsity, avg_speed, avg_latency, avg_speed / base_speed, base_speed, base_f + base_b])
for key in results.keys():
avg_latency = sum(results[key][0]) / len(results[key][0])
avg_speed = sum(results[key][1]) / len(results[key][1])
results[key] = [avg_latency, avg_speed]
all_results[seqlen] = results
import json
with open(f"all_results_{excel_file_name}.json", "w") as f:
json.dump(all_results, f)
write_to_excel(excel_label, excel_data, excel_dir_path, excel_file_name)
profile_blocksparse_fwd_bwd()
# Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/benchmarks/benchmark_flash_attention.py
import openpyxl
from block_sparse_attn.utils.benchmark import benchmark_forward, benchmark_backward
import math
import torch
import os
def benchmark_fwd_bwd(
fn,
*inputs,
grad=None,
repeats=10,
desc="",
verbose=True,
amp=False,
amp_dtype=torch.float16,
**kwinputs,
):
"""Use Pytorch Benchmark on the forward+backward pass of an arbitrary function."""
return (
benchmark_forward(
fn,
*inputs,
repeats=repeats,
desc=desc,
verbose=verbose,
amp=amp,
amp_dtype=amp_dtype,
**kwinputs,
),
benchmark_backward(
fn,
*inputs,
grad=grad,
repeats=repeats,
desc=desc,
verbose=verbose,
amp=amp,
amp_dtype=amp_dtype,
**kwinputs,
),
)
def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"):
assert mode in ["fwd", "bwd", "fwd_bwd"]
f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1)
return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f)
def efficiency(flop, time):
return (flop / time / 10**12) if not math.isnan(time) else 0.0
def time_fwd_bwd(func, *args, **kwargs):
time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs)
return time_f[1].mean, time_b[1].mean
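# Usage sketch (hypothetical arguments): f_sec, b_sec = time_fwd_bwd(attn_fn, q, k, v, repeats=10,
# verbose=False) returns the mean forward and backward latencies in seconds, taken from the
# benchmark_forward/benchmark_backward measurements.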
def write_to_excel(label, data, dir_path, file_name):
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.append(label)
os.makedirs(dir_path, exist_ok=True)
for row in data:
sheet.append(row)
workbook.save(dir_path + file_name + ".xlsx")
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
/******************************************************************************
* Adapted by Junxian Guo from https://github.com/Dao-AILab/flash-attention/blob/main/csrc/flash_attn/flash_api.cpp
******************************************************************************/
// Include these 2 headers instead of torch/extension.h since we don't need all of the torch headers.
#include <torch/python.h>
#include <torch/nn/functional.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cutlass/numeric_types.h>
#include "flash.h"
#include "static_switch.h"
#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
void set_params_fprop(Flash_fwd_params &params,
// sizes
const size_t b,
const size_t seqlen_q,
const size_t seqlen_k,
const size_t seqlen_q_rounded,
const size_t seqlen_k_rounded,
const size_t h,
const size_t h_k,
const size_t d,
const size_t d_rounded,
// device pointers
const at::Tensor q,
const at::Tensor k,
const at::Tensor v,
at::Tensor out,
void *cu_seqlens_q_d,
void *cu_seqlens_k_d,
void *seqused_k,
void *p_d,
void *softmax_lse_d,
float p_dropout,
float softmax_scale,
int window_size_left,
int window_size_right) {
// Reset the parameters
memset(&params, 0, sizeof(params));
params.is_bf16 = q.dtype() == torch::kBFloat16;
// Set the pointers and strides.
params.q_ptr = q.data_ptr();
params.k_ptr = k.data_ptr();
params.v_ptr = v.data_ptr();
// All strides are in elements, not bytes.
params.q_row_stride = q.stride(-3);
params.k_row_stride = k.stride(-3);
params.v_row_stride = v.stride(-3);
params.q_head_stride = q.stride(-2);
params.k_head_stride = k.stride(-2);
params.v_head_stride = v.stride(-2);
params.o_ptr = out.data_ptr();
params.o_row_stride = out.stride(-3);
params.o_head_stride = out.stride(-2);
if (cu_seqlens_q_d == nullptr) {
params.q_batch_stride = q.stride(0);
params.k_batch_stride = k.stride(0);
params.v_batch_stride = v.stride(0);
params.o_batch_stride = out.stride(0);
}
params.cu_seqlens_q = static_cast<int *>(cu_seqlens_q_d);
params.cu_seqlens_k = static_cast<int *>(cu_seqlens_k_d);
params.seqused_k = static_cast<int *>(seqused_k);
// P = softmax(QK^T)
params.p_ptr = p_d;
// Softmax sum
params.softmax_lse_ptr = softmax_lse_d;
// Set the dimensions.
params.b = b;
params.h = h;
params.h_k = h_k;
params.h_h_k_ratio = h / h_k;
params.seqlen_q = seqlen_q;
params.seqlen_k = seqlen_k;
params.seqlen_q_rounded = seqlen_q_rounded;
params.seqlen_k_rounded = seqlen_k_rounded;
params.d = d;
params.d_rounded = d_rounded;
// Set the different scale values.
params.scale_softmax = softmax_scale;
params.scale_softmax_log2 = softmax_scale * M_LOG2E;
// Set this to probability of keeping an element to simplify things.
params.p_dropout = 1.f - p_dropout;
// Convert p from float to int so we don't have to convert the random uint to float to compare.
// [Minor] We want to round down since when we do the comparison we use <= instead of <
// params.p_dropout_in_uint = uint32_t(std::floor(params.p_dropout * 4294967295.0));
// params.p_dropout_in_uint16_t = uint16_t(std::floor(params.p_dropout * 65535.0));
params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0));
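// Example: with p_dropout = 0.17 the keep probability is 0.83, so
// p_dropout_in_uint8_t = uint8_t(std::floor(0.83 * 255.0)) = 211.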
params.rp_dropout = 1.f / params.p_dropout;
params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax;
TORCH_CHECK(p_dropout < 1.f);
// Causal is the special case where window_size_right == 0 and window_size_left < 0.
// Local is the more general case where window_size_right >= 0 or window_size_left >= 0.
params.is_causal = window_size_left < 0 && window_size_right == 0;
if (window_size_left < 0 && window_size_right >= 0) { window_size_left = seqlen_k; }
if (window_size_left >= 0 && window_size_right < 0) { window_size_right = seqlen_k; }
params.window_size_left = window_size_left;
params.window_size_right = window_size_right;
params.is_seqlens_k_cumulative = true;
}
void set_params_dgrad(Flash_bwd_params &params,
// sizes
const size_t b,
const size_t seqlen_q,
const size_t seqlen_k,
const size_t seqlen_q_rounded,
const size_t seqlen_k_rounded,
const size_t h,
const size_t h_k,
const size_t d,
const size_t d_rounded,
// device pointers
const at::Tensor q,
const at::Tensor k,
const at::Tensor v,
const at::Tensor out,
const at::Tensor dout,
at::Tensor dq,
at::Tensor dk,
at::Tensor dv,
void *cu_seqlens_q_d,
void *cu_seqlens_k_d,
void *dq_accum_d,
void *dk_accum_d,
void *dv_accum_d,
void *softmax_lse_d,
void *dsoftmax_sum_d,
float p_dropout,
float softmax_scale,
int window_size_left,
int window_size_right,
bool deterministic) {
set_params_fprop(params,
b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, d, d_rounded,
q, k, v, out,
cu_seqlens_q_d,
cu_seqlens_k_d,
nullptr,
nullptr,
softmax_lse_d,
p_dropout,
softmax_scale,
window_size_left,
window_size_right);
// Set the pointers and strides.
params.do_ptr = dout.data_ptr();
params.do_row_stride = dout.stride(-3);
params.do_head_stride = dout.stride(-2);
params.dq_ptr = dq.data_ptr();
params.dk_ptr = dk.data_ptr();
params.dv_ptr = dv.data_ptr();
params.dq_row_stride = dq.stride(-3);
params.dk_row_stride = dk.stride(-3);
params.dv_row_stride = dv.stride(-3);
params.dq_head_stride = dq.stride(-2);
params.dk_head_stride = dk.stride(-2);
params.dv_head_stride = dv.stride(-2);
if (cu_seqlens_q_d == nullptr) {
params.do_batch_stride = dout.stride(0);
params.dq_batch_stride = dq.stride(0);
params.dk_batch_stride = dk.stride(0);
params.dv_batch_stride = dv.stride(0);
}
params.dq_accum_ptr = dq_accum_d;
params.dk_accum_ptr = dk_accum_d;
params.dv_accum_ptr = dv_accum_d;
// Softmax sum
params.dsoftmax_sum = dsoftmax_sum_d;
params.deterministic = deterministic;
}
void run_mha_fwd(Flash_fwd_params &params, cudaStream_t stream, bool force_split_kernel=false) {
FP16_SWITCH(!params.is_bf16, [&] {
FWD_HEADDIM_SWITCH(params.d, [&] {
if (params.num_splits <= 1 && !force_split_kernel) { // If we don't set it num_splits == 0
run_mha_fwd_<elem_type, kHeadDim>(params, stream);
} else {
run_mha_fwd_splitkv_dispatch<elem_type, kHeadDim>(params, stream);
}
});
});
}
void run_mha_fwd_block(Flash_fwd_params &params, cudaStream_t stream, bool force_split_kernel=false) {
FP16_SWITCH(!params.is_bf16, [&] {
FWD_BLOCK_HEADDIM_SWITCH(params.d, [&] {
if (params.num_splits <= 1 && !force_split_kernel) {
run_mha_fwd_block_<elem_type, kHeadDim>(params, stream);
}
});
});
}
// Find the number of splits that maximizes the occupancy. For example, if we have
// batch * n_heads = 48 and we have 108 SMs, having 2 splits (efficiency = 0.89) is
// better than having 3 splits (efficiency = 0.67). However, we also don't want too many
// splits as that would incur more HBM reads/writes.
// So we find the best efficiency, then find the smallest number of splits that gets 85%
// of the best efficiency.
inline int num_splits_heuristic(int batch_nheads_mblocks, int num_SMs, int num_n_blocks, int max_splits) {
// If we have enough to almost fill the SMs, then just use 1 split
if (batch_nheads_mblocks >= 0.8f * num_SMs) { return 1; }
max_splits = std::min({max_splits, num_SMs, num_n_blocks});
float max_efficiency = 0.f;
std::vector<float> efficiency;
efficiency.reserve(max_splits);
auto ceildiv = [](int a, int b) { return (a + b - 1) / b; };
// Some splits are not eligible. For example, if we have 64 blocks and choose 11 splits,
// we'll have 6 * 10 + 4 blocks. If we choose 12 splits, we'll have 6 * 11 + (-2) blocks
// (i.e. it's 11 splits anyway).
// So we check if the number of blocks per split is the same as the previous num_splits.
auto is_split_eligible = [&ceildiv, &num_n_blocks](int num_splits) {
return num_splits == 1 || ceildiv(num_n_blocks, num_splits) != ceildiv(num_n_blocks, num_splits - 1);
};
for (int num_splits = 1; num_splits <= max_splits; num_splits++) {
if (!is_split_eligible(num_splits)) {
efficiency.push_back(0.f);
} else {
float n_waves = float(batch_nheads_mblocks * num_splits) / num_SMs;
float eff = n_waves / ceil(n_waves);
// printf("num_splits = %d, eff = %f\n", num_splits, eff);
if (eff > max_efficiency) { max_efficiency = eff; }
efficiency.push_back(eff);
}
}
for (int num_splits = 1; num_splits <= max_splits; num_splits++) {
if (!is_split_eligible(num_splits)) { continue; }
if (efficiency[num_splits - 1] >= 0.85 * max_efficiency) {
// printf("num_splits chosen = %d\n", num_splits);
return num_splits;
}
}
return 1;
}
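// Worked instance of the example in the comment above (illustrative arithmetic only): with
// batch_nheads_mblocks = 48 and num_SMs = 108, and assuming every split count is eligible,
// 1 split gives n_waves = 48/108 ~ 0.44 and eff ~ 0.44; 2 splits give n_waves = 96/108 ~ 0.89
// and eff ~ 0.89; 3 splits give n_waves = 144/108 ~ 1.33, which needs 2 waves, so eff ~ 0.67.
// The best efficiency is ~0.89 and 2 is the smallest split count reaching 85% of it, so the
// heuristic returns 2.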
std::vector<at::Tensor>
mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size
const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size
c10::optional<at::Tensor> &out_, // batch_size x seqlen_q x num_heads x head_size
c10::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
const float p_dropout,
const float softmax_scale,
bool is_causal,
int window_size_left,
int window_size_right,
const bool return_softmax,
c10::optional<at::Generator> gen_) {
auto dprops = at::cuda::getCurrentDeviceProperties();
// bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer.");
// We will support Turing in the near future
// TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports Turing GPUs or newer.");
auto q_dtype = q.dtype();
TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
"FlashAttention only support fp16 and bf16 data type");
if (q_dtype == torch::kBFloat16) {
TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 is only supported on Ampere GPUs or newer");
}
TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
const auto sizes = q.sizes();
const int batch_size = sizes[0];
int seqlen_q = sizes[1];
int num_heads = sizes[2];
const int head_size_og = sizes[3];
const int seqlen_k = k.size(1);
const int num_heads_k = k.size(2);
TORCH_CHECK(batch_size > 0, "batch size must be positive");
TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
if (window_size_left >= seqlen_k) { window_size_left = -1; }
if (window_size_right >= seqlen_k) { window_size_right = -1; }
// causal=true is the same as causal=false in this case
if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; }
if (is_causal) { window_size_right = 0; }
// Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case
// H/t Daniel Haziza
const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && !alibi_slopes_.has_value();
if (seqlenq_ngroups_swapped) {
const int ngroups = num_heads / num_heads_k;
q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2);
seqlen_q = ngroups;
num_heads = num_heads_k;
}
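// Shape sketch for the swap above (illustrative values): for decoding with seqlen_q == 1,
// num_heads = 32 and num_heads_k = 8 (so ngroups = 4), q of shape (b, 1, 32, d) is viewed as
// (b, 8, 4, d) and transposed to (b, 4, 8, d); the 4 query groups now act as seqlen_q = 4 while
// num_heads becomes 8. The output is transposed and reshaped back to (b, 1, 32, d) at the end
// of this function.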
CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og);
CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_og);
CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size_og);
at::Tensor q_padded, k_padded, v_padded;
if (head_size_og % 8 != 0) {
q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
} else {
q_padded = q;
k_padded = k;
v_padded = v;
}
at::Tensor out;
if (out_.has_value()) {
out = out_.value();
TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs");
CHECK_DEVICE(out);
TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension");
CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size_og);
if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); }
} else {
out = torch::empty_like(q_padded);
}
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
const int head_size = round_multiple(head_size_og, 8);
const int head_size_rounded = round_multiple(head_size, 32);
const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)q.get_device()};
auto opts = q.options();
auto softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
at::Tensor p;
// Only return softmax if there's dropout to reduce compilation time
if (return_softmax) {
TORCH_CHECK(p_dropout > 0.0f, "return_softmax is only supported when p_dropout > 0.0");
p = torch::empty({ batch_size, num_heads, seqlen_q_rounded, seqlen_k_rounded }, opts);
}
Flash_fwd_params params;
set_params_fprop(params,
batch_size,
seqlen_q, seqlen_k,
seqlen_q_rounded, seqlen_k_rounded,
num_heads, num_heads_k,
head_size, head_size_rounded,
q_padded, k_padded, v_padded, out,
/*cu_seqlens_q_d=*/nullptr,
/*cu_seqlens_k_d=*/nullptr,
/*seqused_k=*/nullptr,
return_softmax ? p.data_ptr() : nullptr,
softmax_lse.data_ptr(),
p_dropout,
softmax_scale,
window_size_left,
window_size_right);
// This needs to match with run_mha_fwd_splitkv_dispatch
const int block_n = head_size <= 64 ? 256 : (head_size <= 128 ? 128 : 64);
const int num_n_blocks = (seqlen_k + block_n - 1) / block_n;
// Technically kBlockM = 64 only for the splitKV kernels, not the standard kernel.
// In any case we don't expect seqlen_q to be larger than 64 for inference.
const int num_m_blocks = (seqlen_q + 64 - 1) / 64;
params.num_splits = 1;
if (p_dropout == 0.0f) { // SplitKV is not implemented for dropout
params.num_splits = num_splits_heuristic(batch_size * num_heads * num_m_blocks, dprops->multiProcessorCount, num_n_blocks, 128);
if (params.num_splits > 1) {
at::Tensor softmax_lse_accum = torch::empty({params.num_splits, batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
at::Tensor out_accum = torch::empty({params.num_splits, batch_size, num_heads, seqlen_q, head_size_rounded}, opts.dtype(at::kFloat));
params.softmax_lseaccum_ptr = softmax_lse_accum.data_ptr();
params.oaccum_ptr = out_accum.data_ptr();
}
TORCH_CHECK(params.num_splits <= 128, "num_splits > 128 not supported");
}
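// Sizing example for the split-KV setup above (illustrative values): with head_size = 128 the
// split-KV kernel uses block_n = 128, so seqlen_k = 4096 gives num_n_blocks = 32; with
// seqlen_q = 1 there is a single m-block, and the heuristic is called with
// batch_size * num_heads blocks of work to spread over the SMs.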
// number of times random will be generated per thread, to offset philox counter in thc random
// state
// We use a custom RNG that increases the offset by batch_size * nheads * 32.
int64_t counter_offset = params.b * params.h * 32;
auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
auto rng_state = torch::empty({2}, options.dtype(torch::kInt64));
// Forward kernel will populate memory with the seed and offset.
params.rng_state = reinterpret_cast<uint64_t*>(rng_state.data_ptr());
if (p_dropout > 0.0) {
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
gen_, at::cuda::detail::getDefaultCUDAGenerator());
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
params.philox_args = gen->philox_cuda_state(counter_offset);
}
if (alibi_slopes_.has_value()) {
auto alibi_slopes = alibi_slopes_.value();
TORCH_CHECK(alibi_slopes.dtype() == torch::kFloat32, "ALiBi slopes must have dtype fp32");
CHECK_DEVICE(alibi_slopes);
TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension");
TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({num_heads}) || alibi_slopes.sizes() == torch::IntArrayRef({batch_size, num_heads}));
params.alibi_slopes_ptr = alibi_slopes.data_ptr();
params.alibi_slopes_batch_stride = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0;
} else {
params.alibi_slopes_ptr = nullptr;
}
if (seqlen_k > 0) {
auto stream = at::cuda::getCurrentCUDAStream().stream();
run_mha_fwd(params, stream);
} else {
// If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0.
out.zero_();
softmax_lse.fill_(std::numeric_limits<float>::infinity());
}
at::Tensor out_padded = out;
if (head_size_og % 8 != 0) {
out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
if (out_.has_value()) { out_.value().copy_(out); }
}
if (seqlenq_ngroups_swapped) {
out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
out_padded = out_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
}
return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p, rng_state};
}
std::vector<at::Tensor>
mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
c10::optional<at::Tensor> &out_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
const at::Tensor &cu_seqlens_q, // b+1
const at::Tensor &cu_seqlens_k, // b+1
c10::optional<at::Tensor> &seqused_k, // b. If given, only this many elements of each batch element's keys are used.
c10::optional<at::Tensor> &alibi_slopes_, // num_heads or b x num_heads
const int max_seqlen_q,
const int max_seqlen_k,
const float p_dropout,
const float softmax_scale,
const bool zero_tensors,
const bool is_causal,
int window_size_left,
int window_size_right,
const bool return_softmax,
c10::optional<at::Generator> gen_) {
if (is_causal) { window_size_right = 0; }
auto dprops = at::cuda::getCurrentDeviceProperties();
// bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer.");
// We will support Turing in the near future
// TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports Turing GPUs or newer.");
auto q_dtype = q.dtype();
TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
"FlashAttention only support fp16 and bf16 data type");
if (q_dtype == torch::kBFloat16) {
TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 is only supported on Ampere GPUs or newer");
}
TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32");
TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32");
CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
CHECK_DEVICE(cu_seqlens_q);
CHECK_DEVICE(cu_seqlens_k);
TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
CHECK_CONTIGUOUS(cu_seqlens_q);
CHECK_CONTIGUOUS(cu_seqlens_k);
const auto sizes = q.sizes();
const int total_q = sizes[0];
const int batch_size = cu_seqlens_q.numel() - 1;
const int num_heads = sizes[1];
const int head_size_og = sizes[2];
const int total_k = k.size(0);
const int num_heads_k = k.size(1);
TORCH_CHECK(batch_size > 0, "batch size must be positive");
TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
if (window_size_left >= max_seqlen_k) { window_size_left = -1; }
if (window_size_right >= max_seqlen_k) { window_size_right = -1; }
CHECK_SHAPE(q, total_q, num_heads, head_size_og);
CHECK_SHAPE(k, total_k, num_heads_k, head_size_og);
CHECK_SHAPE(v, total_k, num_heads_k, head_size_og);
CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
CHECK_SHAPE(cu_seqlens_k, batch_size + 1);
if (seqused_k.has_value()){
auto seqused_k_ = seqused_k.value();
TORCH_CHECK(seqused_k_.dtype() == torch::kInt32, "seqused_k must have dtype int32");
TORCH_CHECK(seqused_k_.is_cuda(), "seqused_k must be on CUDA device");
TORCH_CHECK(seqused_k_.is_contiguous(), "seqused_k must be contiguous");
CHECK_SHAPE(seqused_k_, batch_size);
}
at::Tensor q_padded, k_padded, v_padded;
if (head_size_og % 8 != 0) {
q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
} else {
q_padded = q;
k_padded = k;
v_padded = v;
}
at::Tensor out;
if (out_.has_value()) {
out = out_.value();
TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs");
CHECK_DEVICE(out);
TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension");
CHECK_SHAPE(out, total_q, num_heads, head_size_og);
if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); }
} else {
out = torch::empty_like(q_padded);
}
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
const int head_size = round_multiple(head_size_og, 8);
const int head_size_rounded = round_multiple(head_size, 32);
const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128);
const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128);
// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)q.get_device()};
auto opts = q.options();
auto softmax_lse = torch::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
at::Tensor p;
// Only return softmax if there's dropout to reduce compilation time
if (return_softmax) {
TORCH_CHECK(p_dropout > 0.0f, "return_softmax is only supported when p_dropout > 0.0");
p = torch::empty({ batch_size, num_heads, seqlen_q_rounded, seqlen_k_rounded }, opts);
}
if (zero_tensors) {
out.zero_();
softmax_lse.fill_(-std::numeric_limits<float>::infinity());
if (return_softmax) {p.zero_();}
}
Flash_fwd_params params;
set_params_fprop(params,
batch_size,
max_seqlen_q, max_seqlen_k,
seqlen_q_rounded, seqlen_k_rounded,
num_heads, num_heads_k,
head_size, head_size_rounded,
q_padded, k_padded, v_padded, out,
cu_seqlens_q.data_ptr(),
cu_seqlens_k.data_ptr(),
seqused_k.has_value() ? seqused_k.value().data_ptr() : nullptr,
return_softmax ? p.data_ptr() : nullptr,
softmax_lse.data_ptr(),
p_dropout,
softmax_scale,
window_size_left,
window_size_right);
// number of times random will be generated per thread, to offset philox counter in thc random
// state
// We use a custom RNG that increases the offset by batch_size * nheads * 32.
int64_t counter_offset = params.b * params.h * 32;
auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
auto rng_state = torch::empty({2}, options.dtype(torch::kInt64));
// Forward kernel will populate memory with the seed and offset.
params.rng_state = reinterpret_cast<uint64_t*>(rng_state.data_ptr());
if (p_dropout > 0.0) {
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
gen_, at::cuda::detail::getDefaultCUDAGenerator());
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
params.philox_args = gen->philox_cuda_state(counter_offset);
}
if (alibi_slopes_.has_value()) {
auto alibi_slopes = alibi_slopes_.value();
TORCH_CHECK(alibi_slopes.dtype() == torch::kFloat32, "ALiBi slopes must have dtype fp32");
CHECK_DEVICE(alibi_slopes);
TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension");
TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({num_heads}) || alibi_slopes.sizes() == torch::IntArrayRef({batch_size, num_heads}));
params.alibi_slopes_ptr = alibi_slopes.data_ptr();
params.alibi_slopes_batch_stride = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0;
} else {
params.alibi_slopes_ptr = nullptr;
}
if (max_seqlen_k > 0) {
auto stream = at::cuda::getCurrentCUDAStream().stream();
run_mha_fwd(params, stream);
} else {
// If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0.
out.zero_();
softmax_lse.fill_(std::numeric_limits<float>::infinity());
}
at::Tensor out_padded = out;
if (head_size_og % 8 != 0) {
out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
if (out_.has_value()) { out_.value().copy_(out); }
}
return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p, rng_state};
}
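// Packing sketch for the varlen interface above (illustrative, not part of the API): for a batch
// with per-sequence lengths {3, 5}, the caller packs q as (total_q = 8, num_heads, head_size) and
// passes cu_seqlens_q = {0, 3, 8} (int32, length b + 1); the i-th sequence then occupies rows
// cu_seqlens_q[i] .. cu_seqlens_q[i + 1] - 1, and max_seqlen_q would be 5 in this example.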
void run_mha_bwd(Flash_bwd_params &params, cudaStream_t stream, const bool configure) {
FP16_SWITCH(!params.is_bf16, [&] {
if (params.d <= 32) {
run_mha_bwd_<elem_type, 32>(params, stream, configure);
} else if (params.d <= 64) {
run_mha_bwd_<elem_type, 64>(params, stream, configure);
} else if (params.d <= 96) {
run_mha_bwd_<elem_type, 96>(params, stream, configure);
} else if (params.d <= 128) {
run_mha_bwd_<elem_type, 128>(params, stream, configure);
} else if (params.d <= 160) {
run_mha_bwd_<elem_type, 160>(params, stream, configure);
} else if (params.d <= 192) {
run_mha_bwd_<elem_type, 192>(params, stream, configure);
} else if (params.d <= 224) {
run_mha_bwd_<elem_type, 224>(params, stream, configure);
} else if (params.d <= 256) {
run_mha_bwd_<elem_type, 256>(params, stream, configure);
}
});
}
void run_mha_bwd_block(Flash_bwd_params &params, cudaStream_t stream, const bool configure) {
FP16_SWITCH(!params.is_bf16, [&] {
BWD_BLOCK_HEADDIM_SWITCH(params.d, [&] {
run_mha_bwd_block_<elem_type, kHeadDim>(params, stream, configure);
});
});
}
std::vector<at::Tensor>
mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads x head_size_og
const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size
const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size
const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size
const at::Tensor &softmax_lse, // b x h x seqlen_q
c10::optional<at::Tensor> &dq_, // batch_size x seqlen_q x num_heads x head_size
c10::optional<at::Tensor> &dk_, // batch_size x seqlen_k x num_heads_k x head_size
c10::optional<at::Tensor> &dv_, // batch_size x seqlen_k x num_heads_k x head_size
c10::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
const float p_dropout, // probability to drop
const float softmax_scale,
const bool is_causal,
int window_size_left,
int window_size_right,
const bool deterministic,
c10::optional<at::Generator> gen_,
c10::optional<at::Tensor> &rng_state) {
if (is_causal) { window_size_right = 0; }
auto dprops = at::cuda::getCurrentDeviceProperties();
// bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
bool is_sm80 = dprops->major == 8 && dprops->minor == 0;
bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer.");
// We will support Turing in the near future
// TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports Turing GPUs or newer.");
bool is_dropout = p_dropout > 0.0;
auto stream = at::cuda::getCurrentCUDAStream().stream();
auto q_dtype = q.dtype();
TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
"FlashAttention only support fp16 and bf16 data type");
if (q_dtype == torch::kBFloat16) {
TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 is only supported on Ampere GPUs or newer");
}
TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype");
TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype");
CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse);
TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
const auto sizes = q.sizes();
const int batch_size = sizes[0];
const int seqlen_q = sizes[1];
const int num_heads = sizes[2];
const int head_size_og = dout.size(3);
const int head_size = sizes[3];
const int seqlen_k = k.size(1);
const int num_heads_k = k.size(2);
TORCH_CHECK(batch_size > 0, "batch size must be positive");
TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256");
if (head_size > 192) {
TORCH_CHECK(is_sm80 || is_sm90, "FlashAttention backward for head dim > 192 requires A100/A800 or H100/H800");
}
TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
const int head_size_rounded = round_multiple(head_size, 32);
const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
TORCH_CHECK(head_size == round_multiple(head_size_og, 8), "head_size must be head_size_og rounded to a multiple of 8");
if (window_size_left >= seqlen_k) { window_size_left = -1; }
if (window_size_right >= seqlen_k) { window_size_right = -1; }
CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size);
CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size);
CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size);
CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size);
CHECK_SHAPE(dout, batch_size, seqlen_q, num_heads, head_size_og);
at::Tensor dq, dk, dv;
if (dq_.has_value()) {
dq = dq_.value();
TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q");
CHECK_DEVICE(dq);
TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension");
CHECK_SHAPE(dq, batch_size, seqlen_q, num_heads, head_size);
} else {
dq = torch::empty_like(q);
}
if (dk_.has_value()) {
dk = dk_.value();
TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q");
CHECK_DEVICE(dk);
TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension");
CHECK_SHAPE(dk, batch_size, seqlen_k, num_heads_k, head_size);
} else {
dk = torch::empty_like(k);
}
if (dv_.has_value()) {
dv = dv_.value();
TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q");
CHECK_DEVICE(dv);
TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension");
CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, head_size);
} else {
dv = torch::empty_like(v);
}
at::Tensor dout_padded;
if (head_size_og % 8 != 0) {
dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
} else {
dout_padded = dout;
}
// bool loop = seqlen_k > blocksize_c;
// TODO: change later, for now set to true for simplicity
bool loop = true;
// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)q.get_device()};
auto opts = q.options();
auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
at::Tensor dq_accum;
at::Tensor dk_accum, dv_accum;
if (loop) {
if (!deterministic) {
dq_accum = torch::empty({batch_size, seqlen_q_rounded, num_heads, head_size_rounded}, opts.dtype(at::kFloat));
} else {
const int nsplits = (dprops->multiProcessorCount + batch_size * num_heads - 1) / (batch_size * num_heads);
dq_accum = torch::zeros({nsplits, batch_size, seqlen_q_rounded, num_heads, head_size_rounded}, opts.dtype(at::kFloat));
}
// dk_accum = torch::empty({batch_size, num_heads_k, seqlen_k_rounded, head_size_rounded}, opts.dtype(at::kFloat));
// dv_accum = torch::empty({batch_size, num_heads_k, seqlen_k_rounded, head_size_rounded}, opts.dtype(at::kFloat));
}
at::Tensor dk_expanded, dv_expanded;
if (num_heads_k != num_heads) { // MQA / GQA
dk_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts);
dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts);
} else {
dk_expanded = dk;
dv_expanded = dv;
}
Flash_bwd_params params;
set_params_dgrad(params,
batch_size,
seqlen_q, seqlen_k,
seqlen_q_rounded, seqlen_k_rounded,
num_heads, num_heads_k,
head_size, head_size_rounded,
q, k, v, out,
dout_padded, dq, dk_expanded, dv_expanded,
nullptr,
nullptr,
loop ? dq_accum.data_ptr() : nullptr,
// loop ? dk_accum.data_ptr() : nullptr,
// loop ? dv_accum.data_ptr() : nullptr,
nullptr,
nullptr,
softmax_lse.data_ptr(),
softmax_d.data_ptr(),
p_dropout,
softmax_scale,
window_size_left,
window_size_right,
deterministic);
params.dq_accum_split_stride = !deterministic ? 0 : dq_accum.stride(0);
auto launch = &run_mha_bwd;
// launch(params, stream, /*configure=*/true);
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
gen_, at::cuda::detail::getDefaultCUDAGenerator());
// We use a custom RNG that increases the offset by batch_size * nheads * 32.
int64_t counter_offset = params.b * params.h * 32;
if ( rng_state.has_value() ) {
params.rng_state = reinterpret_cast<uint64_t*>(rng_state.value().data_ptr());
} else if( is_dropout ) {
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
params.philox_args = gen->philox_cuda_state(counter_offset);
auto seeds = at::cuda::philox::unpack(params.philox_args);
params.rng_state[0] = std::get<0>(seeds);
params.rng_state[1] = std::get<1>(seeds);
}
if (alibi_slopes_.has_value()) {
auto alibi_slopes = alibi_slopes_.value();
TORCH_CHECK(alibi_slopes.dtype() == torch::kFloat32, "ALiBi slopes must have dtype fp32");
CHECK_DEVICE(alibi_slopes);
TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension");
TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({num_heads}) || alibi_slopes.sizes() == torch::IntArrayRef({batch_size, num_heads}));
params.alibi_slopes_ptr = alibi_slopes.data_ptr();
params.alibi_slopes_batch_stride = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0;
} else {
params.alibi_slopes_ptr = nullptr;
}
if (seqlen_q > 0) {
launch(params, stream, /*configure=*/false);
} else {
// If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0.
dk_expanded.zero_();
dv_expanded.zero_();
softmax_d.zero_();
}
// For MQA/GQA we need to sum dK and dV across the groups
if (num_heads_k != num_heads) {
at::sum_out(dk, at::reshape(dk_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3});
at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3});
}
if (head_size_og % 8 != 0) {
dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
}
return { dq, dk, dv, softmax_d };
}
std::vector<at::Tensor>
mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads x head_size
const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
const at::Tensor &out, // total_q x num_heads x head_size
const at::Tensor &softmax_lse, // b x h x s softmax logsumexp
c10::optional<at::Tensor> &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
c10::optional<at::Tensor> &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
c10::optional<at::Tensor> &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
const at::Tensor &cu_seqlens_q, // b+1
const at::Tensor &cu_seqlens_k, // b+1
c10::optional<at::Tensor> &alibi_slopes_, // num_heads or b x num_heads
const int max_seqlen_q,
const int max_seqlen_k, // max sequence length to choose the kernel
const float p_dropout, // probability to drop
const float softmax_scale,
const bool zero_tensors,
const bool is_causal,
int window_size_left,
int window_size_right,
const bool deterministic,
c10::optional<at::Generator> gen_,
c10::optional<at::Tensor> &rng_state) {
if (is_causal) { window_size_right = 0; }
auto dprops = at::cuda::getCurrentDeviceProperties();
// bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
bool is_sm80 = dprops->major == 8 && dprops->minor == 0;
bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer.");
// We will support Turing in the near future
// TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports Turing GPUs or newer.");
bool is_dropout = p_dropout > 0.0;
auto stream = at::cuda::getCurrentCUDAStream().stream();
auto q_dtype = q.dtype();
TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
"FlashAttention only support fp16 and bf16 data type");
if (q_dtype == torch::kBFloat16) {
TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 is only supported on Ampere GPUs or newer");
}
TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype");
TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype");
TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32");
TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32");
CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse);
CHECK_DEVICE(cu_seqlens_q); CHECK_DEVICE(cu_seqlens_k);
TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
CHECK_CONTIGUOUS(cu_seqlens_q);
CHECK_CONTIGUOUS(cu_seqlens_k);
const auto sizes = q.sizes();
const int total_q = sizes[0];
const int batch_size = cu_seqlens_q.numel() - 1;
const int num_heads = sizes[1];
const int head_size_og = dout.size(2);
const int head_size = sizes[2];
const int total_k = k.size(0);
const int num_heads_k = k.size(1);
TORCH_CHECK(batch_size > 0, "batch size must be positive");
TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256");
if (head_size > 192) {
TORCH_CHECK(is_sm80 || is_sm90, "FlashAttention backward for head dim > 192 requires A100/A800 or H100/H800");
}
TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
const int head_size_rounded = round_multiple(head_size, 32);
const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128);
const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128);
TORCH_CHECK(head_size == round_multiple(head_size_og, 8), "head_size must be head_size_og rounded to a multiple of 8");
if (window_size_left >= max_seqlen_k) { window_size_left = -1; }
if (window_size_right >= max_seqlen_k) { window_size_right = -1; }
CHECK_SHAPE(q, total_q, num_heads, head_size);
CHECK_SHAPE(k, total_k, num_heads_k, head_size);
CHECK_SHAPE(v, total_k, num_heads_k, head_size);
CHECK_SHAPE(out, total_q, num_heads, head_size);
CHECK_SHAPE(dout, total_q, num_heads, head_size_og);
CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
CHECK_SHAPE(cu_seqlens_k, batch_size + 1);
at::Tensor dq, dk, dv;
if (dq_.has_value()) {
dq = dq_.value();
TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q");
CHECK_DEVICE(dq);
TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension");
CHECK_SHAPE(dq, total_q, num_heads, head_size);
} else {
dq = torch::empty_like(q);
}
if (dk_.has_value()) {
dk = dk_.value();
TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q");
CHECK_DEVICE(dk);
TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension");
CHECK_SHAPE(dk, total_k, num_heads_k, head_size);
} else {
dk = torch::empty_like(k);
}
if (dv_.has_value()) {
dv = dv_.value();
TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q");
CHECK_DEVICE(dv);
TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension");
CHECK_SHAPE(dv, total_k, num_heads_k, head_size);
} else {
dv = torch::empty_like(v);
}
at::Tensor dout_padded;
if (head_size_og % 8 != 0) {
dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
} else {
dout_padded = dout;
}
// bool loop = max_seqlen_k > blocksize_c;
// TODO: change later, for now set to true for simplicity
bool loop = true;
// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)q.get_device()};
auto opts = q.options();
auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
at::Tensor dq_accum;
if (loop) {
// We don't want to allocate dq_accum of size (batch, seqlen_q_rounded, num_heads, head_size_rounded)
// because that would be too large if there is a very long sequence and the rest of the sequences are short.
// Instead, we allocate dq_accum of size (total_q + 128 * batch, num_heads, head_size_rounded).
// Note that 128 is the max block size on the seqlen_q dimension.
// For dQ, the i-th sequence is stored in indices from cu_seqlens[i] + 128 * i to
// cu_seqlens[i + 1] + 128 * i - 1. This ensures that the i-th sequence and (i + 1)-th sequence will
// be at least 128 apart. It's ok for us to do atomicAdds up to 128 rows beyond what we're normally
// allowed to do. So we won't have to do any bound checking, and performance should stay the same.
if (!deterministic) {
dq_accum = torch::empty({total_q + 128 * batch_size, num_heads, head_size_rounded}, opts.dtype(at::kFloat));
} else {
const int nsplits = (dprops->multiProcessorCount + batch_size * num_heads - 1) / (batch_size * num_heads);
dq_accum = torch::zeros({nsplits, total_q + 128 * batch_size, num_heads, head_size_rounded}, opts.dtype(at::kFloat));
}
}
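// Index sketch for the dq_accum layout described above (illustrative values): with per-sequence
// lengths {3, 5} (cu_seqlens_q = {0, 3, 8}), sequence 0 occupies rows 0..2 and sequence 1 starts
// at row 3 + 128 * 1 = 131 of the (total_q + 128 * batch_size) = 264 allocated rows; the
// up-to-128-row overshoot from atomicAdds on sequence 0 reaches at most row 130, so it never
// touches sequence 1's region.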
at::Tensor dk_expanded, dv_expanded;
if (num_heads_k != num_heads) { // MQA / GQA
dk_expanded = torch::empty({total_k, num_heads, head_size}, opts);
dv_expanded = torch::empty({total_k, num_heads, head_size}, opts);
} else {
dk_expanded = dk;
dv_expanded = dv;
}
if( zero_tensors ) {
dq.zero_();
dk_expanded.zero_();
dv_expanded.zero_();
softmax_d.zero_();
}
Flash_bwd_params params;
set_params_dgrad(params,
batch_size,
max_seqlen_q, max_seqlen_k,
seqlen_q_rounded, seqlen_k_rounded,
num_heads, num_heads_k,
head_size, head_size_rounded,
q, k, v, out,
dout_padded, dq, dk_expanded, dv_expanded,
cu_seqlens_q.data_ptr(),
cu_seqlens_k.data_ptr(),
loop ? dq_accum.data_ptr() : nullptr,
nullptr,
nullptr,
softmax_lse.data_ptr(),
softmax_d.data_ptr(),
p_dropout,
softmax_scale,
window_size_left,
window_size_right,
deterministic);
params.dq_accum_split_stride = !deterministic ? 0 : dq_accum.stride(0);
auto launch = &run_mha_bwd;
// launch(params, stream, /*configure=*/true);
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
gen_, at::cuda::detail::getDefaultCUDAGenerator());
// We use a custom RNG that increases the offset by batch_size * nheads * 32.
int64_t counter_offset = params.b * params.h * 32;
if ( rng_state.has_value() ) {
params.rng_state = reinterpret_cast<uint64_t*>(rng_state.value().data_ptr());
} else if( is_dropout ) {
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
params.philox_args = gen->philox_cuda_state(counter_offset);
auto seeds = at::cuda::philox::unpack(params.philox_args);
params.rng_state[0] = std::get<0>(seeds);
params.rng_state[1] = std::get<1>(seeds);
}
if (alibi_slopes_.has_value()) {
auto alibi_slopes = alibi_slopes_.value();
TORCH_CHECK(alibi_slopes.dtype() == torch::kFloat32, "ALiBi slopes must have dtype fp32");
CHECK_DEVICE(alibi_slopes);
TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension");
TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({num_heads}) || alibi_slopes.sizes() == torch::IntArrayRef({batch_size, num_heads}));
params.alibi_slopes_ptr = alibi_slopes.data_ptr();
params.alibi_slopes_batch_stride = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0;
} else {
params.alibi_slopes_ptr = nullptr;
}
if (max_seqlen_q > 0) {
launch(params, stream, /*configure=*/false);
} else {
// If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0.
dk_expanded.zero_();
dv_expanded.zero_();
softmax_d.zero_();
}
// For MQA/GQA we need to sum dK and dV across the groups
if (num_heads_k != num_heads) {
at::sum_out(dk, at::reshape(dk_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2});
at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2});
}
if (head_size_og % 8 != 0) {
dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
}
return { dq, dk, dv, softmax_d };
}
std::vector<at::Tensor>
mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
const at::Tensor &kcache, // batch_size_c x seqlen_k x num_heads_k x head_size
const at::Tensor &vcache, // batch_size_c x seqlen_k x num_heads_k x head_size
c10::optional<const at::Tensor> &k_, // batch_size x seqlen_knew x num_heads_k x head_size
c10::optional<const at::Tensor> &v_, // batch_size x seqlen_knew x num_heads_k x head_size
c10::optional<const at::Tensor> &seqlens_k_, // batch_size
c10::optional<const at::Tensor> &rotary_cos_, // seqlen_ro x (rotary_dim / 2)
c10::optional<const at::Tensor> &rotary_sin_, // seqlen_ro x (rotary_dim / 2)
c10::optional<const at::Tensor> &cache_batch_idx_, // indices to index into the KV cache
c10::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
c10::optional<at::Tensor> &out_, // batch_size x seqlen_q x num_heads x head_size
const float softmax_scale,
bool is_causal,
int window_size_left,
int window_size_right,
bool is_rotary_interleaved, // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2
int num_splits
) {
auto dprops = at::cuda::getCurrentDeviceProperties();
// bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer.");
// We will support Turing in the near future
// TORCH_CHECK(is_sm90 || is_sm8x || is_sm75, "FlashAttention only supports Turing GPUs or newer.");
auto q_dtype = q.dtype();
TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
"FlashAttention only support fp16 and bf16 data type");
if (q_dtype == torch::kBFloat16) {
TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 is only supported on Ampere GPUs or newer");
}
TORCH_CHECK(kcache.dtype() == q_dtype, "query and key must have the same dtype");
TORCH_CHECK(vcache.dtype() == q_dtype, "query and value must have the same dtype");
CHECK_DEVICE(q); CHECK_DEVICE(kcache); CHECK_DEVICE(vcache);
TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(kcache.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(vcache.stride(-1) == 1, "Input tensor must have contiguous last dimension");
const auto sizes = q.sizes();
const int batch_size = sizes[0];
int seqlen_q = sizes[1];
int num_heads = sizes[2];
const int head_size_og = sizes[3];
const int seqlen_k = kcache.size(1);
const int num_heads_k = kcache.size(2);
const int batch_size_c = kcache.size(0);
TORCH_CHECK(batch_size > 0, "batch size must be positive");
TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
// causal=true is the same as causal=false in this case
if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; }
if (is_causal) { window_size_right = 0; }
// Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case
// H/t Daniel Haziza
const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && head_size_og % 8 == 0 && !alibi_slopes_.has_value();
if (seqlenq_ngroups_swapped) {
const int ngroups = num_heads / num_heads_k;
q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2);
seqlen_q = ngroups;
num_heads = num_heads_k;
}
if (window_size_left >= seqlen_k) { window_size_left = -1; }
if (window_size_right >= seqlen_k) { window_size_right = -1; }
CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og);
CHECK_SHAPE(kcache, batch_size_c, seqlen_k, num_heads_k, head_size_og);
CHECK_SHAPE(vcache, batch_size_c, seqlen_k, num_heads_k, head_size_og);
at::Tensor q_padded, kcache_padded, vcache_padded;
if (head_size_og % 8 != 0) {
q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
kcache_padded = torch::nn::functional::pad(kcache, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
vcache_padded = torch::nn::functional::pad(vcache, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
} else {
q_padded = q;
kcache_padded = kcache;
vcache_padded = vcache;
}
at::Tensor out;
if (out_.has_value()) {
out = out_.value();
TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs");
CHECK_DEVICE(out);
TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension");
CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size_og);
if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); }
} else {
out = torch::empty_like(q_padded);
}
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
const int head_size = round_multiple(head_size_og, 8);
const int head_size_rounded = round_multiple(head_size, 32);
const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)q.get_device()};
auto opts = q.options();
auto softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
Flash_fwd_params params;
set_params_fprop(params,
batch_size,
seqlen_q, seqlen_k,
seqlen_q_rounded, seqlen_k_rounded,
num_heads, num_heads_k,
head_size, head_size_rounded,
q_padded, kcache_padded, vcache_padded, out,
/*cu_seqlens_q_d=*/nullptr,
/*cu_seqlens_k_d=*/nullptr,
/*seqused_k=*/nullptr,
/*p_ptr=*/nullptr,
softmax_lse.data_ptr(),
/*p_dropout=*/0.f,
softmax_scale,
window_size_left,
window_size_right);
at::Tensor k, v, k_padded, v_padded;
if (k_.has_value()) {
TORCH_CHECK(v_.has_value(), "If key is supplied, value must also be passed in");
TORCH_CHECK(seqlens_k_.has_value(), "If key is supplied, seqlens_k must also be passed in");
TORCH_CHECK(seqlen_q <= seqlen_k, "If key is supplied, it must have seqlen <= the seqlen of the KV cache");
k = k_.value();
v = v_.value();
TORCH_CHECK(k.dtype() == q_dtype, "Key must have the same dtype as query");
TORCH_CHECK(v.dtype() == q_dtype, "Value must have the same dtype as query");
CHECK_DEVICE(k); CHECK_DEVICE(v);
TORCH_CHECK(k.stride(-1) == 1, "Key tensor must have contiguous last dimension");
TORCH_CHECK(v.stride(-1) == 1, "Value tensor must have contiguous last dimension");
int seqlen_knew = k.size(1);
CHECK_SHAPE(k, batch_size, seqlen_knew, num_heads_k, head_size_og);
CHECK_SHAPE(v, batch_size, seqlen_knew, num_heads_k, head_size_og);
if (head_size_og % 8 != 0) {
k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
} else {
k_padded = k;
v_padded = v;
}
params.seqlen_knew = seqlen_knew;
params.knew_ptr = k_padded.data_ptr();
params.vnew_ptr = v_padded.data_ptr();
// All strides are in elements, not bytes.
params.knew_batch_stride = k_padded.stride(0);
params.vnew_batch_stride = v_padded.stride(0);
params.knew_row_stride = k_padded.stride(-3);
params.vnew_row_stride = v_padded.stride(-3);
params.knew_head_stride = k_padded.stride(-2);
params.vnew_head_stride = v_padded.stride(-2);
}
if (seqlens_k_.has_value()) {
auto seqlens_k = seqlens_k_.value();
TORCH_CHECK(seqlens_k.dtype() == torch::kInt32, "seqlens_k must have dtype int32");
CHECK_DEVICE(seqlens_k);
CHECK_CONTIGUOUS(seqlens_k);
CHECK_SHAPE(seqlens_k, batch_size);
params.cu_seqlens_k = static_cast<int *>(seqlens_k.data_ptr());
}
params.is_seqlens_k_cumulative = !(seqlens_k_.has_value());
if (rotary_cos_.has_value()) {
TORCH_CHECK(k_.has_value(), "If rotary cos/sin are provided, new key / value to be appended to KV cache must also be provided");
auto rotary_cos = rotary_cos_.value();
CHECK_DEVICE(rotary_cos);
params.rotary_dim = rotary_cos.size(1) * 2;
TORCH_CHECK(params.rotary_dim <= head_size, "rotary_dim must be <= headdim");
TORCH_CHECK(params.rotary_dim % 16 == 0, "Only rotary dimensions divisible by 16 are currently supported");
const int seqlen_ro = rotary_cos.size(0);
TORCH_CHECK(seqlen_ro >= seqlen_k, "cos/sin seqlen must be at least the seqlen of KV cache");
CHECK_SHAPE(rotary_cos, seqlen_ro, params.rotary_dim / 2);
CHECK_CONTIGUOUS(rotary_cos);
TORCH_CHECK(rotary_cos.scalar_type() == q_dtype, "rotary_cos must have the same dtype as query");
TORCH_CHECK(rotary_sin_.has_value(), "If rotary cos is provided, rotary sin must also be provided");
auto rotary_sin = rotary_sin_.value();
CHECK_DEVICE(rotary_sin);
CHECK_SHAPE(rotary_sin, seqlen_ro, params.rotary_dim / 2);
CHECK_CONTIGUOUS(rotary_sin);
TORCH_CHECK(rotary_sin.scalar_type() == q_dtype, "rotary_sin must have the same dtype as query");
params.rotary_cos_ptr = rotary_cos.data_ptr();
params.rotary_sin_ptr = rotary_sin.data_ptr();
params.is_rotary_interleaved = is_rotary_interleaved;
} else {
params.rotary_dim = 0;
}
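// Pairing sketch for is_rotary_interleaved (illustrative, rotary_dim = 8 assumed): the
// interleaved layout rotates element pairs (0,1), (2,3), (4,5), (6,7), while the
// non-interleaved layout pairs index i with index i + rotary_dim / 2, i.e. (0,4), (1,5),
// (2,6), (3,7).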
if (cache_batch_idx_.has_value()) {
auto cache_batch_idx = cache_batch_idx_.value();
CHECK_DEVICE(cache_batch_idx);
CHECK_CONTIGUOUS(cache_batch_idx);
TORCH_CHECK(cache_batch_idx.scalar_type() == torch::kInt32, "cache_batch_idx must have dtype int32");
params.cache_batch_idx = reinterpret_cast<int *>(cache_batch_idx.data_ptr());
}
// This needs to match with run_mha_fwd_splitkv_dispatch
const int block_n = head_size <= 64 ? 256 : (head_size <= 128 ? 128 : 64);
const int num_n_blocks = (seqlen_k + block_n - 1) / block_n;
// Technically kBlockM = 64 only for the splitKV kernels, not the standard kernel.
// In any case we don't expect seqlen_q to be larger than 64 for inference.
const int num_m_blocks = (seqlen_q + 64 - 1) / 64;
params.num_splits = num_splits;
if (num_splits < 1) {
params.num_splits = num_splits_heuristic(batch_size * num_heads * num_m_blocks, dprops->multiProcessorCount, num_n_blocks, 128);
}
TORCH_CHECK(params.num_splits <= 128, "num_splits > 128 not supported");
if (params.num_splits > 1) {
at::Tensor softmax_lse_accum = torch::empty({params.num_splits, batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
at::Tensor out_accum = torch::empty({params.num_splits, batch_size, num_heads, seqlen_q, head_size_rounded}, opts.dtype(at::kFloat));
params.softmax_lseaccum_ptr = softmax_lse_accum.data_ptr();
params.oaccum_ptr = out_accum.data_ptr();
}
if (alibi_slopes_.has_value()) {
auto alibi_slopes = alibi_slopes_.value();
TORCH_CHECK(alibi_slopes.dtype() == torch::kFloat32, "ALiBi slopes must have dtype fp32");
CHECK_DEVICE(alibi_slopes);
TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension");
TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({num_heads}) || alibi_slopes.sizes() == torch::IntArrayRef({batch_size, num_heads}));
params.alibi_slopes_ptr = alibi_slopes.data_ptr();
params.alibi_slopes_batch_stride = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0;
} else {
params.alibi_slopes_ptr = nullptr;
}
auto stream = at::cuda::getCurrentCUDAStream().stream();
// Only split kernel supports appending to KV cache, or indexing to the cache with cache_batch_idx
run_mha_fwd(params, stream, /*force_split_kernel=*/k_.has_value() || cache_batch_idx_.has_value());
if (head_size_og % 8 != 0) {
out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
if (out_.has_value()) { out_.value().copy_(out); }
if (k_.has_value()) {
// It's expensive to copy the KV cache here for the case where head size not divisible by 8,
// but we don't expect to get this case in practice. This is just so that the code works for that case.
kcache.copy_(kcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}));
vcache.copy_(vcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}));
}
}
if (seqlenq_ngroups_swapped) {
out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
}
return {out, softmax_lse};
}
// added by JXGuo
const int SPARSE_SIZE = 128;
std::vector<at::Tensor>
mha_fwd_block(const at::Tensor &q,
// total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
const at::Tensor &k,
// total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
const at::Tensor &v,
// total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
const at::Tensor &cu_seqlens_q, // b+1
const at::Tensor &cu_seqlens_k, // b+1
// const bool is_blocksparse,
const int m_block_dim,
const int n_block_dim,
const at::Tensor &head_mask_type, // (num_heads)
c10::optional<at::Tensor> &streaming_info_, // (num_heads * 2), i.e. (num_heads, 2) flattened
c10::optional<at::Tensor> &row_blockmask_, // (batch_size, num_blocksparse_heads, seqlen_m / m_block_dim, seqlen_n / n_block_dim)
const int max_seqlen_q_,
const int max_seqlen_k_,
const float p_dropout,
const float softmax_scale,
const bool is_causal,
const bool exact_streaming,
const bool return_softmax,
int window_size_left,
int window_size_right,
c10::optional<at::Generator> gen_)
{
auto dprops = at::cuda::getCurrentDeviceProperties();
// bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
const bool has_blockmask = row_blockmask_.has_value();
const bool has_streaming_info = streaming_info_.has_value();
at::Tensor row_blockmask, streaming_info;
if (has_blockmask){
row_blockmask = row_blockmask_.value();
}
if (has_streaming_info){
streaming_info = streaming_info_.value();
}
TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer.");
auto q_dtype = q.dtype();
TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
"FlashAttention only support fp16 and bf16 data type");
if (q_dtype == torch::kBFloat16) {
TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 is only supported on Ampere GPUs or newer");
}
TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32");
TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32");
TORCH_CHECK(head_mask_type.dtype() == torch::kInt32, "head_mask_type must have dtype int32");
if(has_blockmask){
TORCH_CHECK(row_blockmask.dtype() == torch::kInt32, "row_blockmask must have dtype int32");
TORCH_CHECK(m_block_dim % SPARSE_SIZE == 0, "m_block_dim must be a multiple of 128");
TORCH_CHECK(n_block_dim % SPARSE_SIZE == 0, "n_block_dim must be a multiple of 128");
}
if(has_streaming_info){
TORCH_CHECK(streaming_info.dtype() == torch::kInt32, "streaming_info must have dtype int32");
TORCH_CHECK(m_block_dim % SPARSE_SIZE == 0, "m_block_dim must be a multiple of 128");
TORCH_CHECK(n_block_dim % SPARSE_SIZE == 0, "n_block_dim must be a multiple of 128");
}
CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
CHECK_DEVICE(cu_seqlens_q);
CHECK_DEVICE(cu_seqlens_k);
CHECK_DEVICE(head_mask_type);
if(has_blockmask){
CHECK_DEVICE(row_blockmask);
}
if(has_streaming_info){
CHECK_DEVICE(streaming_info);
}
TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
CHECK_CONTIGUOUS(cu_seqlens_q);
CHECK_CONTIGUOUS(cu_seqlens_k);
CHECK_CONTIGUOUS(head_mask_type);
if(has_blockmask){
CHECK_CONTIGUOUS(row_blockmask);
}
if(has_streaming_info){
CHECK_CONTIGUOUS(streaming_info);
}
const auto sizes = q.sizes();
const int total_q = sizes[0];
const int batch_size = cu_seqlens_q.numel() - 1;
const int num_heads = sizes[1];
const int head_size_og = sizes[2];
const int total_k = k.size(0);
const int num_heads_k = k.size(1);
int num_blocksparse_heads = 0;
TORCH_CHECK(batch_size > 0, "batch size must be positive");
TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
if (window_size_left >= max_seqlen_k_) { window_size_left = -1; }
if (window_size_right >= max_seqlen_k_) { window_size_right = -1; }
CHECK_SHAPE(q, total_q, num_heads, head_size_og);
CHECK_SHAPE(k, total_k, num_heads_k, head_size_og);
CHECK_SHAPE(v, total_k, num_heads_k, head_size_og);
CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
CHECK_SHAPE(cu_seqlens_k, batch_size + 1);
at::Tensor q_padded, k_padded, v_padded;
if (head_size_og % 8 != 0) {
q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
} else {
q_padded = q;
k_padded = k;
v_padded = v;
}
at::Tensor out;
out = torch::empty_like(q_padded);
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
const int head_size = round_multiple(head_size_og, 8);
const int head_size_rounded = round_multiple(head_size, 32);
const int max_seqlen_q_rounded = round_multiple(max_seqlen_q_, SPARSE_SIZE);
const int max_seqlen_k_rounded = round_multiple(max_seqlen_k_, SPARSE_SIZE);
CHECK_SHAPE(head_mask_type, num_heads);
if(has_blockmask){
num_blocksparse_heads = row_blockmask.size(1);
CHECK_SHAPE(row_blockmask, batch_size, num_blocksparse_heads, max_seqlen_q_rounded / m_block_dim, max_seqlen_k_rounded / n_block_dim);
}
if(has_streaming_info){
CHECK_SHAPE(streaming_info, num_heads * 2);
}
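// streaming_info holds num_heads pairs of ints; judging from how the iterators below index it,
// entry [2*h] describes the sink window and entry [2*h + 1] the local window for head h
// (interpreted as block counts by fwdStreaming and as token counts by fwdExactStreaming).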
// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)q.get_device()};
auto opts = q.options();
auto softmax_lse = torch::empty({batch_size, num_heads, max_seqlen_q_}, opts.dtype(at::kFloat));
at::Tensor p;
// Only return softmax if there's dropout to reduce compilation time
if (return_softmax) {
TORCH_CHECK(p_dropout > 0.0f, "return_softmax is only supported when p_dropout > 0.0");
p = torch::zeros({ batch_size, num_heads, max_seqlen_q_rounded, max_seqlen_k_rounded }, opts);
}
if(is_causal){
window_size_right = 0;
}
Flash_fwd_params params;
set_params_fprop(params,
batch_size,
max_seqlen_q_, max_seqlen_k_,
max_seqlen_q_rounded, max_seqlen_k_rounded,
num_heads, num_heads_k,
head_size, head_size_rounded,
q_padded, k_padded, v_padded, out,
cu_seqlens_q.data_ptr(),
cu_seqlens_k.data_ptr(),
nullptr,
return_softmax ? p.data_ptr() : nullptr,
softmax_lse.data_ptr(),
p_dropout,
softmax_scale,
window_size_left,
window_size_right);
params.head_mask_type = static_cast<int *>(head_mask_type.data_ptr());
if(has_blockmask){
params.blockmask = static_cast<int *>(row_blockmask.data_ptr());
params.m_block_dim = m_block_dim;
params.n_block_dim = n_block_dim;
params.num_blocksparse_heads = num_blocksparse_heads;
}else{
params.blockmask = nullptr;
}
if(has_streaming_info){
params.streaming_info = static_cast<int *>(streaming_info.data_ptr());
params.is_exact_streaming = exact_streaming;
params.m_block_dim = m_block_dim;
params.n_block_dim = n_block_dim;
}else{
params.streaming_info = nullptr;
params.is_exact_streaming = false;
}
// number of times random will be generated per thread, to offset philox counter in thc random
// state
// We use a custom RNG that increases the offset by batch_size * nheads * 32.
int64_t counter_offset = params.b * params.h * 32;
auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
auto rng_state = torch::empty({2}, options.dtype(torch::kInt64));
// Forward kernel will populate memory with the seed and offset.
params.rng_state = reinterpret_cast<uint64_t*>(rng_state.data_ptr());
if (p_dropout > 0.0) {
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
gen_, at::cuda::detail::getDefaultCUDAGenerator());
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
params.philox_args = gen->philox_cuda_state(counter_offset);
}
auto stream = at::cuda::getCurrentCUDAStream().stream();
run_mha_fwd_block(params, stream);
at::Tensor out_padded = out;
if (head_size_og % 8 != 0) {
out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
}
return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p, rng_state};
}
std::vector<at::Tensor>
mha_bwd_block(const at::Tensor &dout, // total_q x num_heads, x head_size
const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
const at::Tensor &out, // total_q x num_heads x head_size
const at::Tensor &softmax_lse, // b x h x s softmax logsumexp
c10::optional<at::Tensor> &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
c10::optional<at::Tensor> &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
c10::optional<at::Tensor> &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
const at::Tensor &cu_seqlens_q, // b+1
const at::Tensor &cu_seqlens_k, // b+1
const int m_block_dim,
const int n_block_dim,
const at::Tensor &head_mask_type, // (num_heads)
c10::optional<at::Tensor> &streaming_info_,
c10::optional<at::Tensor> &col_blockmask_, // (batch_size, num_blocksparse_heads, seqlen_n / n_block_dim, seqlen_m / m_block_dim)
const int max_seqlen_q_,
const int max_seqlen_k_, // max sequence length to choose the kernel
const float p_dropout, // probability to drop
const float softmax_scale,
const bool zero_tensors,
const bool is_causal,
int window_size_left,
int window_size_right,
const bool deterministic,
c10::optional<at::Generator> gen_,
c10::optional<at::Tensor> &rng_state)
{
if (is_causal) { window_size_right = 0; }
auto dprops = at::cuda::getCurrentDeviceProperties();
bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
bool is_sm80 = dprops->major == 8 && dprops->minor == 0;
bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
const bool has_blockmask = col_blockmask_.has_value();
const bool has_streaming_info = streaming_info_.has_value();
at::Tensor col_blockmask, streaming_info;
if (has_blockmask){
col_blockmask = col_blockmask_.value();
}
if (has_streaming_info){
streaming_info = streaming_info_.value();
}
TORCH_CHECK(is_sm90 || is_sm8x, "FlashAttention only supports Ampere GPUs or newer.");
bool is_dropout = p_dropout > 0.0;
auto stream = at::cuda::getCurrentCUDAStream().stream();
auto q_dtype = q.dtype();
TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
"FlashAttention only support fp16 and bf16 data type");
if (q_dtype == torch::kBFloat16) {
TORCH_CHECK(is_sm90 || is_sm8x, "bfloat16 is only supported on Ampere GPUs or newer");
}
TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype");
TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype");
TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32");
TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32");
TORCH_CHECK(head_mask_type.dtype() == torch::kInt32, "head_mask_type must have dtype int32");
if(has_blockmask){
TORCH_CHECK(col_blockmask.dtype() == torch::kInt32, "col_blockmask must have dtype int32");
TORCH_CHECK(m_block_dim % SPARSE_SIZE == 0, "m_block_dim must be a multiple of 128");
TORCH_CHECK(n_block_dim % SPARSE_SIZE == 0, "n_block_dim must be a multiple of 128");
}
if(has_streaming_info){
TORCH_CHECK(streaming_info.dtype() == torch::kInt32, "streaming_info must have dtype int32");
TORCH_CHECK(m_block_dim % SPARSE_SIZE == 0, "m_block_dim must be a multiple of 128");
TORCH_CHECK(n_block_dim % SPARSE_SIZE == 0, "n_block_dim must be a multiple of 128");
}
CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse);
CHECK_DEVICE(cu_seqlens_q); CHECK_DEVICE(cu_seqlens_k);
CHECK_DEVICE(head_mask_type);
if(has_blockmask){
CHECK_DEVICE(col_blockmask);
}
if(has_streaming_info){
CHECK_DEVICE(streaming_info);
}
TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
CHECK_CONTIGUOUS(cu_seqlens_q);
CHECK_CONTIGUOUS(cu_seqlens_k);
CHECK_CONTIGUOUS(head_mask_type);
if(has_blockmask){
CHECK_CONTIGUOUS(col_blockmask);
}
if(has_streaming_info){
CHECK_CONTIGUOUS(streaming_info);
}
const auto sizes = q.sizes();
const int total_q = sizes[0];
const int batch_size = cu_seqlens_q.numel() - 1;
const int num_heads = sizes[1];
const int head_size_og = dout.size(2);
const int head_size = sizes[2];
const int total_k = k.size(0);
const int num_heads_k = k.size(1);
int num_blocksparse_heads = 0;
TORCH_CHECK(batch_size > 0, "batch size must be positive");
TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256");
if (head_size > 192) {
TORCH_CHECK(is_sm80 || is_sm90, "FlashAttention backward for head dim > 192 requires A100/A800 or H100/H800");
}
TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
const int head_size_rounded = round_multiple(head_size, 32);
const int max_seqlen_q_rounded = round_multiple(max_seqlen_q_, SPARSE_SIZE);
const int max_seqlen_k_rounded = round_multiple(max_seqlen_k_, SPARSE_SIZE);
TORCH_CHECK(head_size == round_multiple(head_size_og, 8), "head_size must be head_size_og rounded to a multiple of 8");
if (window_size_left >= max_seqlen_k_) { window_size_left = -1; }
if (window_size_right >= max_seqlen_k_) { window_size_right = -1; }
CHECK_SHAPE(q, total_q, num_heads, head_size);
CHECK_SHAPE(k, total_k, num_heads_k, head_size);
CHECK_SHAPE(v, total_k, num_heads_k, head_size);
CHECK_SHAPE(out, total_q, num_heads, head_size);
CHECK_SHAPE(dout, total_q, num_heads, head_size_og);
CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
CHECK_SHAPE(cu_seqlens_k, batch_size + 1);
CHECK_SHAPE(head_mask_type, num_heads);
if(has_blockmask){
num_blocksparse_heads = col_blockmask.size(1);
CHECK_SHAPE(col_blockmask, batch_size, num_blocksparse_heads, max_seqlen_k_rounded / n_block_dim, max_seqlen_q_rounded / m_block_dim);// todo: check the shape
}
if(has_streaming_info){
CHECK_SHAPE(streaming_info, num_heads * 2);
}
at::Tensor dq, dk, dv;
if (dq_.has_value()) {
dq = dq_.value();
TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q");
CHECK_DEVICE(dq);
TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension");
CHECK_SHAPE(dq, total_q, num_heads, head_size);
} else {
dq = torch::empty_like(q);
}
if (dk_.has_value()) {
dk = dk_.value();
TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q");
CHECK_DEVICE(dk);
TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension");
CHECK_SHAPE(dk, total_k, num_heads_k, head_size);
} else {
dk = torch::empty_like(k);
}
if (dv_.has_value()) {
dv = dv_.value();
TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q");
CHECK_DEVICE(dv);
TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension");
CHECK_SHAPE(dv, total_k, num_heads_k, head_size);
} else {
dv = torch::empty_like(v);
}
at::Tensor dout_padded;
if (head_size_og % 8 != 0) {
dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
} else {
dout_padded = dout;
}
// bool loop = max_seqlen_k > blocksize_c;
// TODO: change later, for now set to true for simplicity
bool loop = true;
// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)q.get_device()};
auto opts = q.options();
auto softmax_d = torch::empty({batch_size, num_heads, max_seqlen_q_rounded}, opts.dtype(at::kFloat));
at::Tensor dq_accum;
if (loop) {
// We don't want to allocate dq_accum of size (batch, seqlen_q_rounded, num_heads, head_size_rounded)
// because that would be too large if there is a very long sequence and the rest of the sequences are short.
// Instead, we allocate dq_accum of size (total_q + 128 * batch, num_heads, head_size_rounded).
// Note that 128 is the max block size on the seqlen_q dimension.
// For dQ, the i-th sequence is stored in indices from cu_seqlens[i] + 128 * i to
// cu_seqlens[i + 1] + 128 * i - 1. This ensures that the i-th sequence and (i + 1)-th sequence will
// be at least 128 apart. It's ok for us to do atomicAdds up to 128 rows beyond what we're normally
// allowed to do. So we won't have to do any bound checking, and performance should stay the same.
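// Illustrative example (not part of the original comment): with batch_size = 2 and
// cu_seqlens = [0, 200, 500], dq_accum has 500 + 128 * 2 = 756 rows; sequence 0 starts at
// row 0 and sequence 1 at row 200 + 128 = 328, leaving 128 scratch rows between them to
// absorb the out-of-bounds atomicAdds.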
if (!deterministic) {
dq_accum = torch::empty({total_q + 128 * batch_size, num_heads, head_size_rounded}, opts.dtype(at::kFloat));
} else {
const int nsplits = (dprops->multiProcessorCount + batch_size * num_heads - 1) / (batch_size * num_heads);
dq_accum = torch::zeros({nsplits, total_q + 128 * batch_size, num_heads, head_size_rounded}, opts.dtype(at::kFloat));
}
}
at::Tensor dk_expanded, dv_expanded;
if (num_heads_k != num_heads) { // MQA / GQA
dk_expanded = torch::empty({total_k, num_heads, head_size}, opts);
dv_expanded = torch::empty({total_k, num_heads, head_size}, opts);
} else {
dk_expanded = dk;
dv_expanded = dv;
}
if( zero_tensors ) {
dq.zero_();
dk_expanded.zero_();
dv_expanded.zero_();
softmax_d.zero_();
}
Flash_bwd_params params;
set_params_dgrad(
params,
batch_size,
max_seqlen_q_, max_seqlen_k_,
max_seqlen_q_rounded, max_seqlen_k_rounded,
num_heads, num_heads_k,
head_size, head_size_rounded,
q, k, v, out,
dout_padded, dq, dk_expanded, dv_expanded,
cu_seqlens_q.data_ptr(),
cu_seqlens_k.data_ptr(),
loop ? dq_accum.data_ptr() : nullptr,
nullptr,
nullptr,
softmax_lse.data_ptr(),
softmax_d.data_ptr(),
p_dropout,
softmax_scale,
window_size_left, window_size_right, deterministic
);
params.dq_accum_split_stride = !deterministic ? 0 : dq_accum.stride(0);
params.head_mask_type = static_cast<int *>(head_mask_type.data_ptr());
if(has_blockmask){
params.blockmask = static_cast<int *>(col_blockmask.data_ptr());
params.m_block_dim = m_block_dim;
params.n_block_dim = n_block_dim;
params.num_blocksparse_heads = num_blocksparse_heads;
}else{
params.blockmask = nullptr;
}
if(has_streaming_info){
params.streaming_info = static_cast<int *>(streaming_info.data_ptr());
params.m_block_dim = m_block_dim;
params.n_block_dim = n_block_dim;
}else{
params.streaming_info = nullptr;
}
auto launch = &run_mha_bwd_block;
// launch(params, stream, /*configure=*/true);
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
gen_, at::cuda::detail::getDefaultCUDAGenerator());
// We use a custom RNG that increases the offset by batch_size * nheads * 32.
int64_t counter_offset = params.b * params.h * 32;
if ( rng_state.has_value() ) {
params.rng_state = reinterpret_cast<uint64_t*>(rng_state.value().data_ptr());
} else if(is_dropout) {
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen->mutex_);
params.philox_args = gen->philox_cuda_state(counter_offset);
auto seeds = at::cuda::philox::unpack(params.philox_args);
params.rng_state[0] = std::get<0>(seeds);
params.rng_state[1] = std::get<1>(seeds);
}
if (max_seqlen_q_ > 0) {
launch(params, stream, /*configure=*/false);
} else {
// If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0.
dk_expanded.zero_();
dv_expanded.zero_();
softmax_d.zero_();
}
// For MQA/GQA we need to sum dK and dV across the groups
if (num_heads_k != num_heads) {
at::sum_out(dk, at::reshape(dk_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2});
at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2});
}
if (head_size_og % 8 != 0) {
dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
}
return { dq, dk, dv, softmax_d };
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.doc() = "FlashAttention";
m.def("fwd", &mha_fwd, "Forward pass");
m.def("varlen_fwd", &mha_varlen_fwd, "Forward pass (variable length)");
m.def("bwd", &mha_bwd, "Backward pass");
m.def("varlen_bwd", &mha_varlen_bwd, "Backward pass (variable length)");
m.def("fwd_kvcache", &mha_fwd_kvcache, "Forward pass, with KV-cache");
m.def("fwd_block", &mha_fwd_block, "Forward pass, with blockmask");
m.def("bwd_block", &mha_bwd_block, "Forward pass, with blockmask");
}
#include <cmath>
#include <cute/tensor.hpp>
#include <cutlass/cutlass.h>
#include <cutlass/array.h>
#include "utils.h"
namespace flash {
using namespace cute;
////////////////////////////////////////////////////////////////////////////////////////////////////
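// apply_alibi adds the ALiBi positional bias to a tile of attention scores: in the causal case
// the same per-column bias (alibi_slope * col_idx) is added to every row, while in the general
// case the bias is -alibi_slope * |row_idx + max_seqlen_k - max_seqlen_q - col_idx|, i.e. it
// decays with the distance between the (right-aligned) query and key positions.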
template <bool Is_causal, typename Engine, typename Layout>
inline __device__ void apply_alibi(Tensor<Engine, Layout> &tensor,
const int col_idx_offset_,
const int max_seqlen_k,
const int row_idx_offset,
const int max_seqlen_q,
const int warp_row_stride,
const float alibi_slope) {
// tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N))
static_assert(Layout::rank == 2, "Only support 2D Tensor");
const int lane_id = threadIdx.x % 32;
const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
if constexpr (Is_causal) { // Simpler, we add the same bias vector to all rows
#pragma unroll
for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
const int col_idx_base = col_idx_offset + nj * 8;
#pragma unroll
for (int j = 0; j < size<1, 0>(tensor); ++j) {
const int col_idx = col_idx_base + j;
#pragma unroll
for (int mi = 0; mi < size<0>(tensor); ++mi) {
tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx;
}
}
}
} else { // Bias depends on both row_idx and col_idx
#pragma unroll
for (int mi = 0; mi < size<0, 1>(tensor); ++mi) {
const int row_idx_base = row_idx_offset + mi * warp_row_stride;
#pragma unroll
for (int i = 0; i < size<0, 0>(tensor); ++i) {
const int row_idx = row_idx_base + i * 8;
#pragma unroll
for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
const int col_idx_base = col_idx_offset + nj * 8;
#pragma unroll
for (int j = 0; j < size<1, 0>(tensor); ++j) {
const int col_idx = col_idx_base + j;
tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
}
}
}
}
}
}
} // namespace flash
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
#pragma once
namespace flash {
////////////////////////////////////////////////////////////////////////////////////////////////////
template<bool Varlen=true>
struct BlockInfo {
template<typename Params>
__device__ BlockInfo(const Params &params, const int bidb)
: sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb])
, sum_s_k(!Varlen || params.cu_seqlens_k == nullptr || !params.is_seqlens_k_cumulative ? -1 : params.cu_seqlens_k[bidb])
, actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q : params.cu_seqlens_q[bidb + 1] - sum_s_q)
// If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb].
// Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K.
, seqlen_k_cache(!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k : (params.is_seqlens_k_cumulative ? params.cu_seqlens_k[bidb + 1] - sum_s_k : params.cu_seqlens_k[bidb]))
, actual_seqlen_k(params.seqused_k ? params.seqused_k[bidb] : seqlen_k_cache + (params.knew_ptr == nullptr ? 0 : params.seqlen_knew))
{
}
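// q_offset / k_offset below pick between the fixed-batch layout (bidb * batch_stride when the
// cumulative sequence lengths are absent, signalled by sum_s == -1) and the packed varlen
// layout (sum_s * row_stride).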
template <typename index_t>
inline __device__ index_t q_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const {
return sum_s_q == -1 ? bidb * batch_stride : uint32_t(sum_s_q) * row_stride;
}
template <typename index_t>
inline __device__ index_t k_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const {
return sum_s_k == -1 ? bidb * batch_stride : uint32_t(sum_s_k) * row_stride;
}
const int sum_s_q;
const int sum_s_k;
const int actual_seqlen_q;
// We have to have seqlen_k_cache declared before actual_seqlen_k, otherwise actual_seqlen_k is set to 0.
const int seqlen_k_cache;
const int actual_seqlen_k;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace flash
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
/******************************************************************************
* Adapted by Junxian Guo from https://github.com/Dao-AILab/flash-attention/blob/main/csrc/flash_attn/src/flash.h
******************************************************************************/
#pragma once
#include <cuda.h>
#include <vector>
#ifdef OLD_GENERATOR_PATH
#include <ATen/CUDAGeneratorImpl.h>
#else
#include <ATen/cuda/CUDAGeneratorImpl.h>
#endif
#include <ATen/cuda/CUDAGraphsUtils.cuh> // For at::cuda::philox::unpack
constexpr int TOTAL_DIM = 0;
constexpr int H_DIM = 1;
constexpr int D_DIM = 2;
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Qkv_params {
using index_t = int64_t;
// The QKV matrices.
void *__restrict__ q_ptr;
void *__restrict__ k_ptr;
void *__restrict__ v_ptr;
// The stride between rows of the Q, K and V matrices.
index_t q_batch_stride;
index_t k_batch_stride;
index_t v_batch_stride;
index_t q_row_stride;
index_t k_row_stride;
index_t v_row_stride;
index_t q_head_stride;
index_t k_head_stride;
index_t v_head_stride;
// The number of heads.
int h, h_k;
// In the case of multi-query and grouped-query attention (MQA/GQA), nheads_k could be
// different from nheads (query).
int h_h_k_ratio; // precompute h / h_k
};
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Flash_fwd_params : public Qkv_params {
// The O matrix (output).
void * __restrict__ o_ptr;
void * __restrict__ oaccum_ptr;
// The stride between rows of O.
index_t o_batch_stride;
index_t o_row_stride;
index_t o_head_stride;
// The pointer to the P matrix.
void * __restrict__ p_ptr;
// The pointer to the softmax sum.
void * __restrict__ softmax_lse_ptr;
void * __restrict__ softmax_lseaccum_ptr;
// The dimensions.
int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, rotary_dim;
// The scaling factors for the kernel.
float scale_softmax;
float scale_softmax_log2;
// array of length b+1 holding starting offset of each sequence.
int * __restrict__ cu_seqlens_q;
int * __restrict__ cu_seqlens_k;
// If provided, the actual length of each k sequence.
int * __restrict__ seqused_k;
int *__restrict__ blockmask;
int *__restrict__ streaming_info;
int *__restrict__ head_mask_type;
// added by JXGuo
int m_block_dim, n_block_dim, num_blocksparse_heads;
// The K_new and V_new matrices.
void * __restrict__ knew_ptr;
void * __restrict__ vnew_ptr;
// The stride between rows of the Q, K and V matrices.
index_t knew_batch_stride;
index_t vnew_batch_stride;
index_t knew_row_stride;
index_t vnew_row_stride;
index_t knew_head_stride;
index_t vnew_head_stride;
// The cos and sin matrices for rotary embedding.
void * __restrict__ rotary_cos_ptr;
void * __restrict__ rotary_sin_ptr;
// The indices to index into the KV cache.
int *__restrict__ cache_batch_idx;
// The dropout probability (probability of keeping an activation).
float p_dropout;
// uint32_t p_dropout_in_uint;
// uint16_t p_dropout_in_uint16_t;
uint8_t p_dropout_in_uint8_t;
// Scale factor of 1 / (1 - p_dropout).
float rp_dropout;
float scale_softmax_rp_dropout;
// Local window size
int window_size_left, window_size_right;
// Random state.
at::PhiloxCudaState philox_args;
// Pointer to the RNG seed (idx 0) and offset (idx 1).
uint64_t * rng_state;
bool is_bf16;
bool is_causal;
bool is_exact_streaming;
// If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb].
// Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K.
bool is_seqlens_k_cumulative;
bool is_rotary_interleaved;
int num_splits; // For split-KV version
void * __restrict__ alibi_slopes_ptr;
index_t alibi_slopes_batch_stride;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Flash_bwd_params : public Flash_fwd_params {
// The dO and dQKV matrices.
void *__restrict__ do_ptr;
void *__restrict__ dq_ptr;
void *__restrict__ dk_ptr;
void *__restrict__ dv_ptr;
// To accumulate dQ
void *__restrict__ dq_accum_ptr;
void *__restrict__ dk_accum_ptr;
void *__restrict__ dv_accum_ptr;
// To accumulate dK and dV in case we're splitting the bwd along the seqlen_q dimension:
// void *__restrict__ dk_accum_ptr;
// void *__restrict__ dv_accum_ptr;
// The stride between rows of the dO, dQ, dK and dV matrices.
// TD [2022-04-16]: We're using 32-bit indexing to save registers.
// The code probably won't work for arrays larger than 2GB.
index_t do_batch_stride;
index_t do_row_stride;
index_t do_head_stride;
index_t dq_batch_stride;
index_t dk_batch_stride;
index_t dv_batch_stride;
index_t dq_row_stride;
index_t dk_row_stride;
index_t dv_row_stride;
index_t dq_head_stride;
index_t dk_head_stride;
index_t dv_head_stride;
// The pointer to the softmax d sum.
void *__restrict__ dsoftmax_sum;
bool deterministic;
index_t dq_accum_split_stride;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename T, int Headdim> void run_mha_fwd_(Flash_fwd_params &params, cudaStream_t stream);
template<typename T, int Headdim> void run_mha_fwd_splitkv_dispatch(Flash_fwd_params &params, cudaStream_t stream);
template<typename T, int Headdim> void run_mha_bwd_(Flash_bwd_params &params, cudaStream_t stream, const bool configure);
template<typename T, int Headdim> void run_mha_fwd_block_(Flash_fwd_params &params, cudaStream_t stream);
template<typename T, int Headdim> void run_mha_bwd_block_(Flash_bwd_params &params, cudaStream_t stream, const bool configure);
/******************************************************************************
* Copyright (c) 2024, Junxian Guo.
******************************************************************************/
#pragma once
namespace flash {
class fwdIteratorBase{
};
// ////////////////////////////////////////////////////////////////////////////////////////////////////
class fwdStreaming: public fwdIteratorBase{
public:
template<typename Params, typename BlockInfo>
__device__ fwdStreaming(const Params &params, const BlockInfo &binfo, const int kBlockM, const int kBlockN, const int batch_idx, const int head_idx, const int loop_step_idx, int n_block_min, int n_block_max) {//row first
this -> row_factor = params.m_block_dim / kBlockM;
this -> col_factor = params.n_block_dim / kBlockN;
this -> sink_block_num = params.streaming_info[head_idx * 2] * col_factor;
this -> local_block_num = params.streaming_info[head_idx * 2 + 1] * col_factor;
this -> m_block_dim = params.m_block_dim;
this -> n_block_dim = params.n_block_dim;
this -> mask_type = params.head_mask_type[head_idx];
this -> n_block_min = n_block_min;
this -> n_block_max = n_block_max;
int act_k = binfo.actual_seqlen_k;
int act_q = binfo.actual_seqlen_q;
bool causal = params.is_causal;
if (causal){
int start_row_idx = max(int((act_q-act_k)/m_block_dim), 0);
this -> start_block_val = (cute::ceil_div(max(act_k - act_q, 0), n_block_dim) + 1 + loop_step_idx/row_factor - start_row_idx) * col_factor;
}else{
this -> start_block_val = max(cute::ceil_div(n_block_max * kBlockN, n_block_dim) * col_factor, 0);
};
this -> no_gap = start_block_val - n_block_min < sink_block_num + local_block_num;
this -> max_block_idx = min(sink_block_num + local_block_num, start_block_val - n_block_min);
assert(mask_type < 0);
assert(params.m_block_dim % kBlockM == 0);
assert(params.n_block_dim % kBlockN == 0);
};
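// mask_val(i) maps the i-th retained column block (enumerated from the most recent local blocks
// down to the sink blocks) to its kernel-level n-block index; it returns -1 once i is out of
// range or the resulting index falls below n_block_min.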
__device__ int mask_val(int block_col_idx) const {
if (block_col_idx > max_block_idx || block_col_idx < 0){
return -1;
};
int ret = 0;
if (no_gap){
ret = start_block_val - 1 - block_col_idx;
return ret >= n_block_min ? ret : -1;
}else{
if (block_col_idx < local_block_num){
return start_block_val - 1 - block_col_idx;
}else{
ret = sink_block_num - 1 - (block_col_idx - local_block_num);
return ret >= n_block_min ? ret : -1;
};
};
};
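// max_no_larger(target) binary-searches the non-increasing sequence mask_val(0), ...,
// mask_val(max_block_idx - 1) for the smallest iterator index whose value is <= target
// (equivalently, the largest remaining block index not exceeding target), returning -1 if no
// such block exists.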
__device__ int max_no_larger(int target) const {
if(max_block_idx == 0){
return -1;
};
int left = 0;
int right = max_block_idx - 1;
while (left <= right) {
int mid = left + (right - left) / 2;
if (mask_val(mid) > target) {
left = mid + 1;
} else {
right = mid - 1;
};
};
return (left < max_block_idx && mask_val(left) <= target) ? left : -1;
};
int sink_block_num, local_block_num;
int start_block_val;
bool no_gap;
int max_block_idx;
int m_block_dim, n_block_dim;
int mask_type;
int n_block_min, n_block_max;
int row_factor, col_factor;
};
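// fwdExactStreaming mirrors fwdStreaming but interprets streaming_info as token counts rather
// than block counts, rounding them up to whole n_block_dim blocks; the extra two local blocks
// are presumably slack for the exact token-level masking applied elsewhere in the kernel.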
class fwdExactStreaming: public fwdIteratorBase{
public:
template<typename Params, typename BlockInfo>
__device__ fwdExactStreaming(const Params &params, const BlockInfo &binfo, const int kBlockM, const int kBlockN, const int batch_idx, const int head_idx, const int loop_step_idx, int n_block_min, int n_block_max) {//row first
this -> row_factor = params.m_block_dim / kBlockM;
this -> col_factor = params.n_block_dim / kBlockN;
int sink_num = params.streaming_info[head_idx * 2];
int local_num = params.streaming_info[head_idx * 2 + 1];
this -> m_block_dim = params.m_block_dim;
this -> n_block_dim = params.n_block_dim;
this -> sink_block_num = cute::ceil_div(sink_num, n_block_dim) * col_factor;
this -> local_block_num = (cute::ceil_div(local_num, n_block_dim)+2) * col_factor;
this -> mask_type = params.head_mask_type[head_idx];
this -> n_block_min = n_block_min;
this -> n_block_max = n_block_max;
int act_k = binfo.actual_seqlen_k;
int act_q = binfo.actual_seqlen_q;
bool causal = params.is_causal;
if (causal){
int start_row_idx = max(int((act_q-act_k)/m_block_dim), 0);
this -> start_block_val = (cute::ceil_div(max(act_k - act_q, 0), n_block_dim) + 1 + loop_step_idx/row_factor - start_row_idx) * col_factor;
}else{
this -> start_block_val = max(cute::ceil_div(n_block_max * kBlockN, n_block_dim) * col_factor, 0);
};
this -> no_gap = start_block_val - n_block_min < sink_block_num + local_block_num;
this -> max_block_idx = min(sink_block_num + local_block_num, start_block_val - n_block_min);
assert(mask_type < 0);
assert(params.m_block_dim % kBlockM == 0);
assert(params.n_block_dim % kBlockN == 0);
};
__device__ int mask_val(int block_col_idx) const {
if (block_col_idx > max_block_idx || block_col_idx < 0){
return -1;
};
int ret = 0;
if (no_gap){
ret = start_block_val - 1 - block_col_idx;
return ret >= n_block_min ? ret : -1;
}else{
if (block_col_idx < local_block_num){
return start_block_val - 1 - block_col_idx;
}else{
ret = sink_block_num - 1 - (block_col_idx - local_block_num);
return ret >= n_block_min ? ret : -1;
};
};
};
__device__ int max_no_larger(int target) const {
if(max_block_idx == 0){
return -1;
};
int left = 0;
int right = max_block_idx - 1;
while (left <= right) {
int mid = left + (right - left) / 2;
if (mask_val(mid) > target) {
left = mid + 1;
} else {
right = mid - 1;
};
};
return (left < max_block_idx && mask_val(left) <= target) ? left : -1;
};
int sink_block_num, local_block_num;
int start_block_val;
bool no_gap;
int max_block_idx;
int m_block_dim, n_block_dim;
int mask_type;
int n_block_min, n_block_max;
int row_factor, col_factor;
};
// ////////////////////////////////////////////////////////////////////////////////////////////////////
class fwdBlockmask: public fwdIteratorBase{
public:
template<typename Params, typename BlockInfo>
__device__ fwdBlockmask(const Params &params, const BlockInfo &binfo, const int kBlockM, const int kBlockN, const int batch_idx, const int head_idx, const int loop_step_idx, int n_block_min, int n_block_max) {//row first
this -> row_factor = params.m_block_dim / kBlockM;
this -> col_factor = params.n_block_dim / kBlockN;
this -> max_block_idx = cute::ceil_div(binfo.actual_seqlen_k, params.n_block_dim) * col_factor;
this -> m_block_dim = params.m_block_dim;
this -> n_block_dim = params.n_block_dim;
this -> mask_type = params.head_mask_type[head_idx];
this -> n_block_min = n_block_min;
this -> n_block_max = n_block_max;
assert(mask_type > 0);
assert(params.m_block_dim % kBlockM == 0);
assert(params.n_block_dim % kBlockN == 0);
blockmask_ptr = params.blockmask + (batch_idx * params.num_blocksparse_heads + mask_type - 1) * int(params.seqlen_q_rounded / m_block_dim) * int(params.seqlen_k_rounded / n_block_dim) + int(loop_step_idx / row_factor) * int(params.seqlen_k_rounded / n_block_dim);
};
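// Each row of the forward block mask is expected to list the active sparse-column indices in
// descending order, padded with -1 (this ordering is what makes the binary search in
// max_no_larger valid); mask_val expands one sparse-block entry into col_factor kernel-sized
// column blocks.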
__device__ int mask_val(int block_col_idx) const {
if (block_col_idx > max_block_idx || block_col_idx < 0){
return -1;
};
int real_block_idx = block_col_idx / col_factor;
int block_col_offset = block_col_idx % col_factor;
int mask_val = blockmask_ptr[real_block_idx];
return mask_val == -1 ? -1 : col_factor * mask_val + col_factor - 1 - block_col_offset;
};
__device__ int max_no_larger(int target) const {
if(max_block_idx == 0){
return -1;
};
int left = 0;
int right = max_block_idx - 1;
while (left <= right) {
int mid = left + (right - left) / 2;
if (mask_val(mid) > target) {
left = mid + 1;
} else {
right = mid - 1;
};
};
return (left < max_block_idx && mask_val(left) <= target) ? left : -1;
};
int *blockmask_ptr;
int max_block_idx;
int m_block_dim, n_block_dim;
int mask_type;
int n_block_min, n_block_max;
int row_factor, col_factor;
};
// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<bool Is_streaming, bool Is_exact_streaming>
class fwdIterator{};
template<>
struct fwdIterator<false, false>: public fwdBlockmask{
template<typename Params, typename BlockInfo>
__device__ fwdIterator(const Params &params, const BlockInfo &binfo, const int kBlockM, const int kBlockN, const int batch_idx, const int head_idx, const int loop_step_idx, int n_block_min, int n_block_max): fwdBlockmask(params, binfo, kBlockM, kBlockN, batch_idx, head_idx, loop_step_idx, n_block_min, n_block_max) {};
};
template<>
struct fwdIterator<true, false>: public fwdStreaming{
template<typename Params, typename BlockInfo>
__device__ fwdIterator(const Params &params, const BlockInfo &binfo, const int kBlockM, const int kBlockN, const int batch_idx, const int head_idx, const int loop_step_idx, int n_block_min, int n_block_max): fwdStreaming(params, binfo, kBlockM, kBlockN, batch_idx, head_idx, loop_step_idx, n_block_min, n_block_max) {};
};
template<>
struct fwdIterator<true, true>: public fwdExactStreaming{
template<typename Params, typename BlockInfo>
__device__ fwdIterator(const Params &params, const BlockInfo &binfo, const int kBlockM, const int kBlockN, const int batch_idx, const int head_idx, const int loop_step_idx, int n_block_min, int n_block_max): fwdExactStreaming(params, binfo, kBlockM, kBlockN, batch_idx, head_idx, loop_step_idx, n_block_min, n_block_max) {};
};
////////////////////////////////////////////////////////////////////////////////////////////////////
class bwdIteratorBase{
};
struct bwdStreaming: public bwdIteratorBase{
public:
template<typename Params, typename BlockInfo>
__device__ bwdStreaming(const Params &params, const BlockInfo &binfo, const int kBlockM, const int kBlockN, const int batch_idx, const int head_idx, const int loop_step_idx, int m_block_min, int m_block_max) {// col first
this -> row_factor = params.m_block_dim / kBlockM;
this -> col_factor = params.n_block_dim / kBlockN;
this -> m_block_dim = params.m_block_dim;
this -> n_block_dim = params.n_block_dim;
this -> mask_type = params.head_mask_type[head_idx];
this -> m_block_min = m_block_min;
this -> m_block_max = m_block_max;
int mask_block_col = cute::ceil_div(loop_step_idx+1, col_factor);
int sink = (this -> mask_type) < 0 ? params.streaming_info[head_idx * 2]: cute::ceil_div(binfo.actual_seqlen_k, this -> n_block_dim);
int local = (this -> mask_type) < 0 ? params.streaming_info[head_idx * 2 + 1]: 0;
this -> sink_block_num = sink * col_factor;
this -> local_block_num = local * col_factor;
int act_q = binfo.actual_seqlen_q;
int act_k = binfo.actual_seqlen_k;
bool causal = params.is_causal;
if(mask_block_col <= sink){
this -> start_block_val = m_block_max;
this -> max_block_idx = m_block_max - m_block_min;
}else{
if (causal){
int free_token_num = act_q - min(act_q, act_k - loop_step_idx * kBlockN);
int end_mask_block_row_idx = free_token_num / params.m_block_dim;//zero based
int num_mask_block_in_end_row = max(0, cute::ceil_div(act_k - act_q + (end_mask_block_row_idx + 1) * params.m_block_dim, params.n_block_dim));
int local_col_mask_block_num = max(0, local - (num_mask_block_in_end_row - mask_block_col));
if(local_col_mask_block_num > 0){
this -> start_block_val = min((end_mask_block_row_idx + local_col_mask_block_num) * row_factor, m_block_max);
this -> max_block_idx = min(local_col_mask_block_num * row_factor, m_block_max - m_block_min);
}else{
this -> start_block_val = 0;
this -> max_block_idx = 0;
};
}else{
int n_mask_block_col = max(cute::ceil_div(act_k, n_block_dim), 0);
bool in_none_causal_local = !causal && mask_block_col <= n_mask_block_col && mask_block_col > n_mask_block_col - local;
if(in_none_causal_local){
this -> start_block_val = m_block_max;
this -> max_block_idx = m_block_max - m_block_min;
}else{
this -> start_block_val = 0;
this -> max_block_idx = 0;
};
};
}
assert(mask_type <= 0); //for blocksparse, mask_type > 0; for streaming, mask_type < 0; for dense, mask_type = 0
assert(params.m_block_dim % kBlockM == 0);
assert(params.n_block_dim % kBlockN == 0);
};
__device__ int mask_val(int block_row_idx) const {
if (block_row_idx > max_block_idx || block_row_idx < 0){
return -1;
};
int ret = start_block_val - 1 - block_row_idx;
return ret >= m_block_min ? ret : -1;
};
__device__ int max_no_larger(int target) const {
if(max_block_idx == 0){
return -1;
};
int left = 0;
int right = max_block_idx - 1;
while (left <= right) {
int mid = left + (right - left) / 2;
if (mask_val(mid) > target) {
left = mid + 1;
} else {
right = mid - 1;
};
};
return (left < max_block_idx && mask_val(left) <= target) ? left : -1;
};
int sink_block_num, local_block_num;
int start_block_val;
int max_block_idx;
int m_block_dim, n_block_dim;
int mask_type;
int m_block_min, m_block_max;
int row_factor, col_factor;
};
struct bwdBlockmask: public bwdIteratorBase{
public:
template<typename Params, typename BlockInfo>
__device__ bwdBlockmask(const Params &params, const BlockInfo &binfo, const int kBlockM, const int kBlockN, const int batch_idx, const int head_idx, const int loop_step_idx, int m_block_min, int m_block_max) {
this -> row_factor = params.m_block_dim / kBlockM;
this -> col_factor = params.n_block_dim / kBlockN;
this -> max_block_idx = cute::ceil_div(binfo.actual_seqlen_q, params.m_block_dim) * row_factor;
this -> m_block_dim = params.m_block_dim;
this -> n_block_dim = params.n_block_dim;
this -> mask_type = params.head_mask_type[head_idx];
this -> m_block_min = m_block_min;
this -> m_block_max = m_block_max;
assert(mask_type > 0);
assert(params.m_block_dim % kBlockM == 0);
assert(params.n_block_dim % kBlockN == 0);
blockmask_ptr = params.blockmask + (batch_idx * params.num_blocksparse_heads + mask_type - 1) * int(params.seqlen_k_rounded / n_block_dim) * int(params.seqlen_q_rounded / m_block_dim) + int(loop_step_idx / col_factor) * int(params.seqlen_q_rounded / m_block_dim);
};
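// The backward block mask is the column-major counterpart of the forward one: the row selected
// by loop_step_idx corresponds to a fixed column (key) block, and its entries are expected to
// list the active row (query) blocks in descending order, -1-padded; mask_val expands each
// entry into row_factor kernel-sized row blocks.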
__device__ int mask_val(int block_row_idx) const {
if (block_row_idx > max_block_idx || block_row_idx < 0){
return -1;
};
int real_block_idx = block_row_idx / row_factor;
int block_row_offset = block_row_idx % row_factor;
int mask_val = blockmask_ptr[real_block_idx];
return mask_val == -1 ? -1 : row_factor * mask_val + row_factor - 1 - block_row_offset;
};
__device__ int max_no_larger(int target) const {
if(max_block_idx == 0){
return -1;
};
int left = 0;
int right = max_block_idx - 1;
while (left <= right) {
int mid = left + (right - left) / 2;
if (mask_val(mid) > target) {
left = mid + 1;
} else {
right = mid - 1;
};
};
return (left < max_block_idx && mask_val(left) <= target) ? left : -1;
};
int *blockmask_ptr;
int max_block_idx;
int m_block_dim, n_block_dim;
int mask_type;
int m_block_min, m_block_max;
int row_factor, col_factor;
};
template<bool Is_streaming>
class bwdIterator{};
template<>
struct bwdIterator<false>: public bwdBlockmask{
template<typename Params, typename BlockInfo>
__device__ bwdIterator(const Params &params, const BlockInfo &binfo, const int kBlockM, const int kBlockN, const int batch_idx, const int head_idx, const int loop_step_idx, int m_block_min, int m_block_max): bwdBlockmask(params, binfo, kBlockM, kBlockN, batch_idx, head_idx, loop_step_idx, m_block_min, m_block_max) {};
};
template<>
struct bwdIterator<true>: public bwdStreaming{
template<typename Params, typename BlockInfo>
__device__ bwdIterator(const Params &params, const BlockInfo &binfo, const int kBlockM, const int kBlockN, const int batch_idx, const int head_idx, const int loop_step_idx, int m_block_min, int m_block_max): bwdStreaming(params, binfo, kBlockM, kBlockN, batch_idx, head_idx, loop_step_idx, m_block_min, m_block_max) {};
};
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace flash
// Copyright (c) 2023, Tri Dao.
// Adapted by Junxian Guo from https://github.com/Dao-AILab/flash-attention/blob/main/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_block_<cutlass::bfloat16_t, 128>(Flash_bwd_params &params, cudaStream_t stream, const bool configure) {
run_mha_bwd_block_hdim128<cutlass::bfloat16_t>(params, stream, configure);
}
// Copyright (c) 2023, Tri Dao.
// Adapted by Junxian Guo from https://github.com/Dao-AILab/flash-attention/blob/main/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_block_<cutlass::half_t, 128>(Flash_bwd_params &params, cudaStream_t stream, const bool configure) {
run_mha_bwd_block_hdim128<cutlass::half_t>(params, stream, configure);
}
// Copyright (c) 2023, Tri Dao.
// Adapted by Junxian Guo from https://github.com/Dao-AILab/flash-attention/blob/main/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_block_<cutlass::bfloat16_t, 32>(Flash_bwd_params &params, cudaStream_t stream, const bool configure) {
run_mha_bwd_block_hdim32<cutlass::bfloat16_t>(params, stream, configure);
}
// Copyright (c) 2023, Tri Dao.
// Adapted by Junxian Guo from https://github.com/Dao-AILab/flash-attention/blob/main/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_block_<cutlass::half_t, 32>(Flash_bwd_params &params, cudaStream_t stream, const bool configure) {
run_mha_bwd_block_hdim32<cutlass::half_t>(params, stream, configure);
}
// Copyright (c) 2023, Tri Dao.
// Adapted by Junxian Guo from https://github.com/Dao-AILab/flash-attention/blob/main/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_block_<cutlass::bfloat16_t, 64>(Flash_bwd_params &params, cudaStream_t stream, const bool configure) {
run_mha_bwd_block_hdim64<cutlass::bfloat16_t>(params, stream, configure);
}
// Copyright (c) 2023, Tri Dao.
// Adapted by Junxian Guo from https://github.com/Dao-AILab/flash-attention/blob/main/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_block_<cutlass::half_t, 64>(Flash_bwd_params &params, cudaStream_t stream, const bool configure) {
run_mha_bwd_block_hdim64<cutlass::half_t>(params, stream, configure);
}
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_bwd_launch_template.h"
template<>
void run_mha_bwd_<cutlass::bfloat16_t, 128>(Flash_bwd_params &params, cudaStream_t stream, const bool configure) {
run_mha_bwd_hdim128<cutlass::bfloat16_t>(params, stream, configure);
}